From 58e69a030695500a2b38f0e09250f0c5e04e4c05 Mon Sep 17 00:00:00 2001 From: Ying-Jer Kao Date: Thu, 26 Sep 2024 13:06:01 +0900 Subject: [PATCH 1/6] Replace magma with cusolver LU --- .../linalg_internal_gpu/cuDet_internal.cu | 180 +++++++++++------- 1 file changed, 108 insertions(+), 72 deletions(-) diff --git a/src/backend/linalg_internal_gpu/cuDet_internal.cu b/src/backend/linalg_internal_gpu/cuDet_internal.cu index fee31c49..7cf02265 100644 --- a/src/backend/linalg_internal_gpu/cuDet_internal.cu +++ b/src/backend/linalg_internal_gpu/cuDet_internal.cu @@ -5,152 +5,188 @@ #include "../utils_internal_gpu/cuAlloc_gpu.hpp" -#ifdef UNI_MAGMA - #include "magma_v2.h" -#endif - namespace cytnx { namespace linalg_internal { void cuDet_internal_cd(void* out, const boost::intrusive_ptr& in, const cytnx_uint64& L) { -#ifdef UNI_MAGMA cytnx_complex128* od = (cytnx_complex128*)out; // result on cpu! cuDoubleComplex* _in = (cuDoubleComplex*)utils_internal::cuMalloc_gpu( in->len * sizeof(cuDoubleComplex)); // unify mem. checkCudaErrors( cudaMemcpy(_in, in->Mem, sizeof(cytnx_complex128) * in->len, cudaMemcpyDeviceToDevice)); - magma_int_t* ipiv; - magma_imalloc_cpu(&ipiv, L + 1); - magma_int_t N = L; - magma_int_t info; - magma_zgetrf_gpu(N, N, _in, N, ipiv, &info); - cytnx_error_msg(info != 0, "[ERROR] magma_zgetrf_gpu fail with info= %d\n", info); + cusolverDnHandle_t cusolverH; + cusolverDnCreate(&cusolverH); + + int* devIpiv; + int* devInfo; + checkCudaErrors(cudaMalloc((void**)&devIpiv, L * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&devInfo, sizeof(int))); + + int workspace_size = 0; + cuDoubleComplex* workspace = NULL; + cusolverDnZgetrf_bufferSize(cusolverH, L, L, _in, L, &workspace_size); + checkCudaErrors(cudaMalloc((void**)&workspace, workspace_size * sizeof(cuDoubleComplex))); + + cusolverDnZgetrf(cusolverH, L, L, _in, L, workspace, devIpiv, devInfo); + + int info; + checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost)); + cytnx_error_msg(info != 0, "[ERROR] cusolverDnZgetrf fail with info= %d\n", info); // since we do unify mem, direct access element is possible: od[0] = 1; bool neg = 0; - for (magma_int_t i = 0; i < N; i++) { - od[0] *= ((cytnx_complex128*)_in)[i * N + i]; + int* ipiv = new int[L]; + checkCudaErrors(cudaMemcpy(ipiv, devIpiv, L * sizeof(int), cudaMemcpyDeviceToHost)); + for (int i = 0; i < L; i++) { + od[0] *= ((cytnx_complex128*)_in)[i * L + i]; if (ipiv[i] != (i + 1)) neg = !neg; } - magma_free_cpu(ipiv); + delete[] ipiv; + cudaFree(devIpiv); + cudaFree(devInfo); + cudaFree(workspace); cudaFree(_in); + cusolverDnDestroy(cusolverH); if (neg) od[0] *= -1; - -#else - cytnx_error_msg(true, - "[ERROR][internal Det] Det for Tensor on GPU require magma. please " - "re-compiling cytnx with magma.%s", - "\n"); -#endif } void cuDet_internal_cf(void* out, const boost::intrusive_ptr& in, const cytnx_uint64& L) { -#ifdef UNI_MAGMA cytnx_complex64* od = (cytnx_complex64*)out; // result on cpu! cuFloatComplex* _in = (cuFloatComplex*)utils_internal::cuMalloc_gpu( in->len * sizeof(cuFloatComplex)); // unify mem. 
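          // Note (applies to all four dtype variants in this file): the input matrix is copied
          // into unified memory, LU-factorized in place with cusolverDn<T>getrf, and the result
          // is then read on the host as det(A) = (-1)^{#row swaps} * prod_i U(i,i), counting a
          // swap whenever the 1-based pivot ipiv[i] differs from i+1.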
checkCudaErrors( cudaMemcpy(_in, in->Mem, sizeof(cytnx_complex64) * in->len, cudaMemcpyDeviceToDevice)); - magma_int_t* ipiv; - magma_imalloc_cpu(&ipiv, L + 1); - magma_int_t N = L; - magma_int_t info; - magma_cgetrf_gpu(N, N, _in, N, ipiv, &info); - cytnx_error_msg(info != 0, "[ERROR] magma_cgetrf_gpu fail with info= %d\n", info); + cusolverDnHandle_t cusolverH; + cusolverDnCreate(&cusolverH); + + int* devIpiv; + int* devInfo; + checkCudaErrors(cudaMalloc((void**)&devIpiv, L * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&devInfo, sizeof(int))); + + int workspace_size = 0; + cuFloatComplex* workspace = NULL; + cusolverDnCgetrf_bufferSize(cusolverH, L, L, _in, L, &workspace_size); + checkCudaErrors(cudaMalloc((void**)&workspace, workspace_size * sizeof(cuFloatComplex))); + + cusolverDnCgetrf(cusolverH, L, L, _in, L, workspace, devIpiv, devInfo); + + int info; + checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost)); + cytnx_error_msg(info != 0, "[ERROR] cusolverDnCgetrf fail with info= %d\n", info); // since we do unify mem, direct access element is possible: od[0] = 1; bool neg = 0; - for (magma_int_t i = 0; i < N; i++) { - od[0] *= ((cytnx_complex64*)_in)[i * N + i]; + int* ipiv = new int[L]; + checkCudaErrors(cudaMemcpy(ipiv, devIpiv, L * sizeof(int), cudaMemcpyDeviceToHost)); + for (int i = 0; i < L; i++) { + od[0] *= ((cytnx_complex64*)_in)[i * L + i]; if (ipiv[i] != (i + 1)) neg = !neg; } - magma_free_cpu(ipiv); + delete[] ipiv; + cudaFree(devIpiv); + cudaFree(devInfo); + cudaFree(workspace); cudaFree(_in); + cusolverDnDestroy(cusolverH); if (neg) od[0] *= -1; - -#else - cytnx_error_msg(true, - "[ERROR][internal Det] Det for Tensor on GPU require magma. please " - "re-compiling cytnx with magma.%s", - "\n"); -#endif } void cuDet_internal_d(void* out, const boost::intrusive_ptr& in, const cytnx_uint64& L) { -#ifdef UNI_MAGMA cytnx_double* od = (cytnx_double*)out; // result on cpu! cytnx_double* _in = (cytnx_double*)utils_internal::cuMalloc_gpu(in->len * sizeof(cytnx_double)); // unify mem. 
checkCudaErrors( cudaMemcpy(_in, in->Mem, sizeof(cytnx_double) * in->len, cudaMemcpyDeviceToDevice)); - magma_int_t* ipiv; - magma_imalloc_cpu(&ipiv, L + 1); - magma_int_t N = L; - magma_int_t info; - magma_dgetrf_gpu(N, N, _in, N, ipiv, &info); - cytnx_error_msg(info != 0, "[ERROR] magma_dgetrf_gpu fail with info= %d\n", info); + cusolverDnHandle_t cusolverH; + cusolverDnCreate(&cusolverH); + + int* devIpiv; + int* devInfo; + checkCudaErrors(cudaMalloc((void**)&devIpiv, L * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&devInfo, sizeof(int))); + + int workspace_size = 0; + cytnx_double* workspace = NULL; + cusolverDnDgetrf_bufferSize(cusolverH, L, L, _in, L, &workspace_size); + checkCudaErrors(cudaMalloc((void**)&workspace, workspace_size * sizeof(cytnx_double))); + + cusolverDnDgetrf(cusolverH, L, L, _in, L, workspace, devIpiv, devInfo); + + int info; + checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost)); + cytnx_error_msg(info != 0, "[ERROR] cusolverDnDgetrf fail with info= %d\n", info); // since we do unify mem, direct access element is possible: od[0] = 1; bool neg = 0; - for (magma_int_t i = 0; i < N; i++) { - od[0] *= _in[i * N + i]; + int* ipiv = new int[L]; + checkCudaErrors(cudaMemcpy(ipiv, devIpiv, L * sizeof(int), cudaMemcpyDeviceToHost)); + for (int i = 0; i < L; i++) { + od[0] *= _in[i * L + i]; if (ipiv[i] != (i + 1)) neg = !neg; } - magma_free_cpu(ipiv); + delete[] ipiv; + cudaFree(devIpiv); + cudaFree(devInfo); + cudaFree(workspace); cudaFree(_in); + cusolverDnDestroy(cusolverH); if (neg) od[0] *= -1; - -#else - cytnx_error_msg(true, - "[ERROR][internal Det] Det for Tensor on GPU require magma. please " - "re-compiling cytnx with magma.%s", - "\n"); -#endif } void cuDet_internal_f(void* out, const boost::intrusive_ptr& in, const cytnx_uint64& L) { -#ifdef UNI_MAGMA cytnx_float* od = (cytnx_float*)out; // result on cpu! cytnx_float* _in = (cytnx_float*)utils_internal::cuMalloc_gpu(in->len * sizeof(cytnx_float)); // unify mem. 
checkCudaErrors( cudaMemcpy(_in, in->Mem, sizeof(cytnx_float) * in->len, cudaMemcpyDeviceToDevice)); - magma_int_t* ipiv; - magma_imalloc_cpu(&ipiv, L + 1); - magma_int_t N = L; - magma_int_t info; - magma_sgetrf_gpu(N, N, _in, N, ipiv, &info); - cytnx_error_msg(info != 0, "[ERROR] magma_sgetrf_gpu fail with info= %d\n", info); + cusolverDnHandle_t cusolverH; + cusolverDnCreate(&cusolverH); + + int* devIpiv; + int* devInfo; + checkCudaErrors(cudaMalloc((void**)&devIpiv, L * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&devInfo, sizeof(int))); + + int workspace_size = 0; + cytnx_float* workspace = NULL; + cusolverDnSgetrf_bufferSize(cusolverH, L, L, _in, L, &workspace_size); + checkCudaErrors(cudaMalloc((void**)&workspace, workspace_size * sizeof(cytnx_float))); + + cusolverDnSgetrf(cusolverH, L, L, _in, L, workspace, devIpiv, devInfo); + + int info; + checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost)); + cytnx_error_msg(info != 0, "[ERROR] cusolverDnSgetrf fail with info= %d\n", info); // since we do unify mem, direct access element is possible: od[0] = 1; bool neg = 0; - for (magma_int_t i = 0; i < N; i++) { - od[0] *= _in[i * N + i]; + int* ipiv = new int[L]; + checkCudaErrors(cudaMemcpy(ipiv, devIpiv, L * sizeof(int), cudaMemcpyDeviceToHost)); + for (int i = 0; i < L; i++) { + od[0] *= _in[i * L + i]; if (ipiv[i] != (i + 1)) neg = !neg; } - magma_free_cpu(ipiv); + delete[] ipiv; + cudaFree(devIpiv); + cudaFree(devInfo); + cudaFree(workspace); cudaFree(_in); + cusolverDnDestroy(cusolverH); if (neg) od[0] *= -1; - -#else - cytnx_error_msg(true, - "[ERROR][internal Det] Det for Tensor on GPU require magma. please " - "re-compiling cytnx with magma.%s", - "\n"); -#endif } } // namespace linalg_internal From 63430180d19472ebc3e00b64d795cc7084dc8e34 Mon Sep 17 00:00:00 2001 From: Ying-Jer Kao Date: Thu, 26 Sep 2024 13:16:44 +0900 Subject: [PATCH 2/6] Remove Magma Dependency --- cmake/Modules/FindCUQUANTUM.cmake | 4 ++-- src/Device.cpp | 16 +--------------- src/backend/linalg_internal_interface.cpp | 20 +------------------- 3 files changed, 4 insertions(+), 36 deletions(-) diff --git a/cmake/Modules/FindCUQUANTUM.cmake b/cmake/Modules/FindCUQUANTUM.cmake index 9df2b081..4f17d0b3 100644 --- a/cmake/Modules/FindCUQUANTUM.cmake +++ b/cmake/Modules/FindCUQUANTUM.cmake @@ -9,10 +9,10 @@ # CUQUANTUM_INCLUDE_DIRS ... cutensor include directory # CUQUANTUM_LIBRARIES ... cutensor libraries # -# MAGMA_ROOT this is required to set! +# CUQUANTUM_ROOT this is required to set! 
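# For example (illustrative path only): cmake -DCUQUANTUM_ROOT=/opt/nvidia/cuquantum ..
# or export CUQUANTUM_ROOT=/opt/nvidia/cuquantum before running cmake.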
# -#If environment variable MAGMA_ROOT is specified, it has same effect as MAGMA_ROOT +#If environment variable CUQUANTUM_ROOT is specified, it has same effect as CUQUANTUM_ROOT if(NOT DEFINED ENV{CUQUANTUM_ROOT} AND NOT DEFINED CUQUANTUM_ROOT) message(FATAL_ERROR "CUQUANTUM_ROOT not set!") diff --git a/src/Device.cpp b/src/Device.cpp index f7d3b176..8fa2c5b3 100644 --- a/src/Device.cpp +++ b/src/Device.cpp @@ -4,10 +4,6 @@ #include #endif -#ifdef UNI_MAGMA - #include "magma_v2.h" -#endif - using namespace std; namespace cytnx { @@ -43,12 +39,6 @@ namespace cytnx { } } - // #ifdef UNI_MAGMA - // int magma_status = magma_init(); - // cytnx_error_msg(magma_status!=MAGMA_SUCCESS,"[ERROR] magma system cannot - // initialize!%s","\n"); - // #endif - #endif #ifdef UNI_OMP @@ -62,11 +52,7 @@ namespace cytnx { }; Device_class::~Device_class(){ - // #ifdef UNI_MAGMA - // int magma_status = magma_finalize(); - // cytnx_error_msg(magma_status!=MAGMA_SUCCESS,"[ERROR] magma system cannot - // finalize!%s","\n"); - // #endif + }; string Device_class::getname(const int& device_id) { diff --git a/src/backend/linalg_internal_interface.cpp b/src/backend/linalg_internal_interface.cpp index 0ef1e356..47f4e1e1 100644 --- a/src/backend/linalg_internal_interface.cpp +++ b/src/backend/linalg_internal_interface.cpp @@ -2,9 +2,6 @@ #ifdef UNI_MKL #include #endif -#ifdef UNI_MAGMA - #include "magma_v2.h" -#endif using namespace std; @@ -29,27 +26,12 @@ namespace cytnx { #endif return code; } - linalg_internal_interface::~linalg_internal_interface() { -#ifdef UNI_GPU - #ifdef UNI_MAGMA - int magma_status = magma_finalize(); - cytnx_error_msg(magma_status != MAGMA_SUCCESS, "[ERROR] magma system cannot finalize!%s", - "\n"); - #endif -#endif - } + linalg_internal_interface::~linalg_internal_interface() {} linalg_internal_interface::linalg_internal_interface() { mkl_code = -1; #ifdef UNI_MKL this->set_mkl_ilp64(); #endif -#ifdef UNI_GPU - #ifdef UNI_MAGMA - int magma_status = magma_init(); - cytnx_error_msg(magma_status != MAGMA_SUCCESS, "[ERROR] magma system cannot initialize!%s", - "\n"); - #endif -#endif Ari_ii = vector>(N_Type, vector(N_Type, NULL)); From fe24972c2c9b7ce52f311b09432b69ed7c4d0ec6 Mon Sep 17 00:00:00 2001 From: Ying-Jer Kao Date: Thu, 26 Sep 2024 13:17:39 +0900 Subject: [PATCH 3/6] Implement Eig using cusolver --- .../linalg_internal_gpu/cuEig_internal.cu | 262 +++++++++--------- 1 file changed, 132 insertions(+), 130 deletions(-) diff --git a/src/backend/linalg_internal_gpu/cuEig_internal.cu b/src/backend/linalg_internal_gpu/cuEig_internal.cu index 3fc65bd4..35442cf9 100644 --- a/src/backend/linalg_internal_gpu/cuEig_internal.cu +++ b/src/backend/linalg_internal_gpu/cuEig_internal.cu @@ -5,10 +5,6 @@ #include "cuAlloc_gpu.hpp" -#ifdef UNI_MAGMA - #include "magma_v2.h" -#endif - namespace cytnx { namespace linalg_internal { @@ -17,155 +13,161 @@ namespace cytnx { void cuEig_internal_cd(const boost::intrusive_ptr &in, boost::intrusive_ptr &e, boost::intrusive_ptr &v, const cytnx_int64 &L) { - /* - char jobs = 'N'; - - cytnx_complex128 *tA; - cytnx_complex128 *buffer_A = - (cytnx_complex128 *)malloc(cytnx_uint64(L) * L * sizeof(cytnx_complex128)); - memcpy(buffer_A, in->Mem, sizeof(cytnx_complex128) * cytnx_uint64(L) * L); + cusolverDnHandle_t cusolverH = NULL; + cudaStream_t stream = NULL; + cusolverDnCreate(&cusolverH); + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + cusolverDnSetStream(cusolverH, stream); + + cuDoubleComplex *d_A, *d_W, *d_V; + cudaMalloc((void **)&d_A, 
sizeof(cuDoubleComplex) * L * L); + cudaMalloc((void **)&d_W, sizeof(cuDoubleComplex) * L); + cudaMemcpy(d_A, in->Mem, sizeof(cuDoubleComplex) * L * L, cudaMemcpyHostToDevice); + + int lwork = 0; + cusolverDnZgeev_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, L, d_A, L, d_W, d_V, L, NULL, + 1, &lwork); + cuDoubleComplex *d_work; + cudaMalloc((void **)&d_work, sizeof(cuDoubleComplex) * lwork); + + int *devInfo; + cudaMalloc((void **)&devInfo, sizeof(int)); + + cusolverDnZgeev(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_OP_N, L, d_A, L, d_W, d_V, L, + NULL, 1, d_work, lwork, devInfo); + + cudaMemcpy(e->Mem, d_W, sizeof(cuDoubleComplex) * L, cudaMemcpyDeviceToHost); if (v->dtype != Type.Void) { - tA = (cytnx_complex128 *)v->Mem; - jobs = 'V'; + cudaMemcpy(v->Mem, d_V, sizeof(cuDoubleComplex) * L * L, cudaMemcpyDeviceToHost); } - lapack_int ldA = L; - lapack_int info; - lapack_int ONE = 1; - - info = LAPACKE_zgeev(LAPACK_COL_MAJOR, jobs, 'N', L, (lapack_complex_double *)buffer_A, ldA, - (lapack_complex_double *)e->Mem, (lapack_complex_double *)tA, L, nullptr, - ONE); - - cytnx_error_msg(info != 0, "%s %d", "Error in Lapack function 'zgeev': Lapack INFO = ", info); - - free(buffer_A); - */ + cudaFree(d_A); + cudaFree(d_W); + cudaFree(d_V); + cudaFree(d_work); + cudaFree(devInfo); + cusolverDnDestroy(cusolverH); + cudaStreamDestroy(stream); } + void cuEig_internal_cf(const boost::intrusive_ptr &in, boost::intrusive_ptr &e, boost::intrusive_ptr &v, const cytnx_int64 &L) { - /* - char jobs = 'N'; - - cytnx_complex64 *tA; - cytnx_complex64 *buffer_A = - (cytnx_complex64 *)malloc(cytnx_uint64(L) * L * sizeof(cytnx_complex64)); - memcpy(buffer_A, in->Mem, sizeof(cytnx_complex64) * cytnx_uint64(L) * L); + cusolverDnHandle_t cusolverH = NULL; + cudaStream_t stream = NULL; + cusolverDnCreate(&cusolverH); + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + cusolverDnSetStream(cusolverH, stream); + + cuFloatComplex *d_A, *d_W, *d_V; + cudaMalloc((void **)&d_A, sizeof(cuFloatComplex) * L * L); + cudaMalloc((void **)&d_W, sizeof(cuFloatComplex) * L); + cudaMemcpy(d_A, in->Mem, sizeof(cuFloatComplex) * L * L, cudaMemcpyHostToDevice); + + int lwork = 0; + cusolverDnCgeev_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, L, d_A, L, d_W, d_V, L, NULL, + 1, &lwork); + cuFloatComplex *d_work; + cudaMalloc((void **)&d_work, sizeof(cuFloatComplex) * lwork); + + int *devInfo; + cudaMalloc((void **)&devInfo, sizeof(int)); + + cusolverDnCgeev(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_OP_N, L, d_A, L, d_W, d_V, L, + NULL, 1, d_work, lwork, devInfo); + + cudaMemcpy(e->Mem, d_W, sizeof(cuFloatComplex) * L, cudaMemcpyDeviceToHost); if (v->dtype != Type.Void) { - tA = (cytnx_complex64 *)v->Mem; - jobs = 'V'; + cudaMemcpy(v->Mem, d_V, sizeof(cuFloatComplex) * L * L, cudaMemcpyDeviceToHost); } - lapack_int ldA = L; - lapack_int info; - lapack_int ONE = 1; - - info = - LAPACKE_cgeev(LAPACK_COL_MAJOR, jobs, 'N', L, (lapack_complex_float *)buffer_A, ldA, - (lapack_complex_float *)e->Mem, (lapack_complex_float *)tA, L, nullptr, ONE); - - cytnx_error_msg(info != 0, "%s %d", "Error in Lapack function 'cgeev': Lapack INFO = ", info); - - free(buffer_A); - */ + cudaFree(d_A); + cudaFree(d_W); + cudaFree(d_V); + cudaFree(d_work); + cudaFree(devInfo); + cusolverDnDestroy(cusolverH); + cudaStreamDestroy(stream); } void cuEig_internal_d(const boost::intrusive_ptr &in, boost::intrusive_ptr &e, boost::intrusive_ptr &v, const cytnx_int64 &L) { - /* - char jobs = 'N'; - - cytnx_double *tA; - cytnx_double *buffer_A = 
(cytnx_double*)malloc(cytnx_uint64(L)*L*sizeof(cytnx_double)); - memcpy(buffer_A,in->Mem,sizeof(cytnx_double)*cytnx_uint64(L)*L); - cytnx_double *e_real = (cytnx_double*)malloc(cytnx_uint64(L)*sizeof(cytnx_double)); - cytnx_double *e_imag = (cytnx_double*)malloc(cytnx_uint64(L)*sizeof(cytnx_double)); - - if(v->dtype!=Type.Void){ - tA = (cytnx_double*)v->Mem; - jobs = 'V'; + cusolverDnHandle_t cusolverH = NULL; + cudaStream_t stream = NULL; + cusolverDnCreate(&cusolverH); + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + cusolverDnSetStream(cusolverH, stream); + + double *d_A, *d_W, *d_V; + cudaMalloc((void **)&d_A, sizeof(double) * L * L); + cudaMalloc((void **)&d_W, sizeof(double) * L); + cudaMemcpy(d_A, in->Mem, sizeof(double) * L * L, cudaMemcpyHostToDevice); + + int lwork = 0; + cusolverDnDgeev_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, L, d_A, L, d_W, d_V, L, NULL, + 1, &lwork); + double *d_work; + cudaMalloc((void **)&d_work, sizeof(double) * lwork); + + int *devInfo; + cudaMalloc((void **)&devInfo, sizeof(int)); + + cusolverDnDgeev(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_OP_N, L, d_A, L, d_W, d_V, L, + NULL, 1, d_work, lwork, devInfo); + + cudaMemcpy(e->Mem, d_W, sizeof(double) * L, cudaMemcpyDeviceToHost); + if (v->dtype != Type.Void) { + cudaMemcpy(v->Mem, d_V, sizeof(double) * L * L, cudaMemcpyDeviceToHost); } - - lapack_int ldA = L; - lapack_int lwork = -1; - cytnx_double *rwork = (cytnx_double*)malloc(sizeof(cytnx_double)*2*L); - cytnx_double workspace = 0; - lapack_int info; - lapack_int ONE = 1; - lapack_int TWO = 2; - /// query lwork - dgeev(&jobs, (char*)"N", &L, buffer_A, &ldA, e_real, e_imag, tA, &L,nullptr, &ONE,&workspace, - &lwork, &info); - - cytnx_error_msg(info != 0, "%s %d", "Error in Lapack function 'dgeev': Lapack INFO = ", info); - lwork = lapack_int(workspace); - cytnx_double* work= (cytnx_double*)malloc(sizeof(cytnx_double)*lwork); - dgeev(&jobs, (char*)"N", &L, buffer_A, &ldA, e_real, e_imag, tA, &L,nullptr, &ONE,work, - &lwork, &info); - - cytnx_error_msg(info != 0, "%s %d", "Error in Lapack function 'dgeev': Lapack INFO = ", info); - - // copy the real and imag to e output: - dcopy(L,e_real, &ONE, (cytnx_double*)e->Mem, &TWO); - dcopy(L,e_imag, &ONE, &(((cytnx_double*)e->Mem)[1]), &TWO); - - free(work); - free(buffer_A); - free(rwork); - free(e_real); - free(e_imag); - */ + cudaFree(d_A); + cudaFree(d_W); + cudaFree(d_V); + cudaFree(d_work); + cudaFree(devInfo); + cusolverDnDestroy(cusolverH); + cudaStreamDestroy(stream); } + void cuEig_internal_f(const boost::intrusive_ptr &in, boost::intrusive_ptr &e, boost::intrusive_ptr &v, const cytnx_int64 &L) { - /* - char jobs = 'N'; - - cytnx_float *tA; - cytnx_float *buffer_A = (cytnx_float*)malloc(cytnx_uint64(L)*L*sizeof(cytnx_float)); - memcpy(buffer_A,in->Mem,sizeof(cytnx_float)*cytnx_uint64(L)*L); - cytnx_float *e_real = (cytnx_float*)malloc(cytnx_uint64(L)*sizeof(cytnx_float)); - cytnx_float *e_imag = (cytnx_float*)malloc(cytnx_uint64(L)*sizeof(cytnx_float)); - - if(v->dtype!=Type.Void){ - tA = (cytnx_float*)v->Mem; - jobs = 'V'; + cusolverDnHandle_t cusolverH = NULL; + cudaStream_t stream = NULL; + cusolverDnCreate(&cusolverH); + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + cusolverDnSetStream(cusolverH, stream); + + float *d_A, *d_W, *d_V; + cudaMalloc((void **)&d_A, sizeof(float) * L * L); + cudaMalloc((void **)&d_W, sizeof(float) * L); + cudaMemcpy(d_A, in->Mem, sizeof(float) * L * L, cudaMemcpyHostToDevice); + + int lwork = 0; + cusolverDnSgeev_bufferSize(cusolverH, 
CUSOLVER_EIG_MODE_VECTOR, L, d_A, L, d_W, d_V, L, NULL, + 1, &lwork); + float *d_work; + cudaMalloc((void **)&d_work, sizeof(float) * lwork); + + int *devInfo; + cudaMalloc((void **)&devInfo, sizeof(int)); + + cusolverDnSgeev(cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_OP_N, L, d_A, L, d_W, d_V, L, + NULL, 1, d_work, lwork, devInfo); + + cudaMemcpy(e->Mem, d_W, sizeof(float) * L, cudaMemcpyDeviceToHost); + if (v->dtype != Type.Void) { + cudaMemcpy(v->Mem, d_V, sizeof(float) * L * L, cudaMemcpyDeviceToHost); } - - lapack_int ldA = L; - lapack_int lwork = -1; - cytnx_float *rwork = (cytnx_float*)malloc(sizeof(cytnx_float)*2*L); - cytnx_float workspace = 0; - lapack_int info; - lapack_int ONE = 1; - lapack_int TWO = 2; - /// query lwork - sgeev(&jobs, (char*)"N", &L, buffer_A, &ldA, e_real, e_imag, tA, &L,nullptr, &ONE,&workspace, - &lwork, &info); - - cytnx_error_msg(info != 0, "%s %d", "Error in Lapack function 'sgeev': Lapack INFO = ", info); - lwork = lapack_int(workspace); - cytnx_float* work= (cytnx_float*)malloc(sizeof(cytnx_float)*lwork); - sgeev(&jobs, (char*)"N", &L, buffer_A, &ldA, e_real, e_imag, tA, &L,nullptr, &ONE,work, - &lwork, &info); - - cytnx_error_msg(info != 0, "%s %d", "Error in Lapack function 'sgeev': Lapack INFO = ", info); - - // copy the real and imag to e output: - scopy(L,e_real, &ONE, (cytnx_float*)e->Mem, &TWO); - scopy(L,e_imag, &ONE, &(((cytnx_float*)e->Mem)[1]), &TWO); - - free(work); - free(buffer_A); - free(rwork); - free(e_real); - free(e_imag); - */ + cudaFree(d_A); + cudaFree(d_W); + cudaFree(d_V); + cudaFree(d_work); + cudaFree(devInfo); + cusolverDnDestroy(cusolverH); + cudaStreamDestroy(stream); } } // namespace linalg_internal From 81dc6973b7634602b31b599e8cfc6c342e7142ea Mon Sep 17 00:00:00 2001 From: Ying-Jer Kao Date: Thu, 26 Sep 2024 13:19:10 +0900 Subject: [PATCH 4/6] Remove Magma dependency --- CMakeLists.txt | 1 - CytnxBKNDCMakeLists.cmake | 20 -------------------- 2 files changed, 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ed854be5..8cf0d85d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,7 +85,6 @@ option(BUILD_DOC "Build API docuemntation" OFF) option(USE_HPTT "Build Cytnx with HPTT" OFF) option(RUN_TESTS "Run Cytnx tests" OFF) option(USE_CUTT "Build Cytnx with CUTT" OFF) -option(USE_MAGMA "Build Cytnx with MAGMA (requires CUDA)" ON) option(USE_CUTENSOR "Build Cytnx with CuTensor (requires CUDA)" ON) option(USE_CUQUANTUM "Build Cytnx with CUQuantum (requires CUDA)" ON) diff --git a/CytnxBKNDCMakeLists.cmake b/CytnxBKNDCMakeLists.cmake index 21213ed5..a4adbafb 100644 --- a/CytnxBKNDCMakeLists.cmake +++ b/CytnxBKNDCMakeLists.cmake @@ -169,26 +169,6 @@ if(USE_CUDA) endif() - if(USE_MAGMA) - find_package( MAGMA REQUIRED) - if(NOT MAGMA_FOUND) - message(FATAL_ERROR "MAGMA not found!") - endif() - message(STATUS "^^^magma root aft: ${MAGMA_ROOT}") - message(STATUS "^^^magma inc dr: ${MAGMA_INCLUDE_DIRS}") - message(STATUS "^^^magma lib dr: ${MAGMA_LIBRARY_DIRS}") - message(STATUS "^^^magma libs: ${MAGMA_LIBRARIES}") - #add_dependencies(cytnx magma) - target_include_directories(cytnx PRIVATE ${MAGMA_INCLUDE_DIRS}) - target_compile_definitions(cytnx PRIVATE UNI_MAGMA) - target_link_libraries(cytnx PUBLIC ${MAGMA_LIBRARIES}) - - message( STATUS "Build with MAGMA: YES") - FILE(APPEND "${CMAKE_BINARY_DIR}/cxxflags.tmp" "-DUNI_MAGMA\n" "") - FILE(APPEND "${CMAKE_BINARY_DIR}/cxxflags.tmp" "-I${MAGMA_INCLUDE_DIRS}\n" "") - FILE(APPEND "${CMAKE_BINARY_DIR}/linkflags.tmp" "${MAGMA_LIBRARIES} -ldl\n" "") # use > to indicate 
special rt processing - message( STATUS "MAGMA: libdir:${MAGMA_LIBRARY_DIRS} incdir:${MAGMA_INCLUDE_DIRS} libs:${MAGMA_LIBRARIES}") - endif() message( STATUS " Build CUDA Support: YES") message( STATUS " - CUDA Version: ${CUDA_VERSION_STRING}") From 35c95aa2296d8a37afb56dab50dddbb38d04070d Mon Sep 17 00:00:00 2001 From: jeffry1829 Date: Tue, 1 Oct 2024 16:23:02 +0800 Subject: [PATCH 5/6] fix (cu)Det error when U is singular & fix Det output device to gpu when original tensor is in gpu & add tests for Det, make gpu test for Directsum actually happens at gpu --- .../linalg_internal_cpu/Det_internal.cpp | 20 ++- .../linalg_internal_gpu/cuDet_internal.cu | 20 ++- src/linalg/Det.cpp | 4 +- tests/CMakeLists.txt | 1 + tests/gpu/CMakeLists.txt | 1 + tests/gpu/linalg_test/Det_test.cpp | 161 ++++++++++++++++++ tests/gpu/linalg_test/Directsum_test.cpp | 2 +- tests/gpu/test_tools.h | 4 +- tests/linalg_test/Det_test.cpp | 158 +++++++++++++++++ 9 files changed, 358 insertions(+), 13 deletions(-) create mode 100644 tests/gpu/linalg_test/Det_test.cpp create mode 100644 tests/linalg_test/Det_test.cpp diff --git a/src/backend/linalg_internal_cpu/Det_internal.cpp b/src/backend/linalg_internal_cpu/Det_internal.cpp index c507059e..c0bdfd14 100644 --- a/src/backend/linalg_internal_cpu/Det_internal.cpp +++ b/src/backend/linalg_internal_cpu/Det_internal.cpp @@ -27,8 +27,9 @@ namespace cytnx { lapack_int N = L; lapack_int info; info = LAPACKE_zgetrf(LAPACK_COL_MAJOR, N, N, _Rin, N, ipiv); + // If the info > 0, that means the U factor is exactly singular, and the det is 0. cytnx_error_msg( - info != 0, "%s %d", + info < 0, "%s %d", "[ERROR][Det_internal] Error in Lapack function 'zgetrf': Lapack INFO = ", info); // Whether lapack_complex_T is defined as std::complex (c++ complex) or T _Complex @@ -51,6 +52,8 @@ namespace cytnx { free(ipiv); free(_Rin); if (neg) od[0] *= -1; + + if (info > 0) od[0] = 0; } void Det_internal_cf(void *out, const boost::intrusive_ptr &Rin, const cytnx_uint64 &L) { @@ -63,8 +66,9 @@ namespace cytnx { lapack_int N = L; lapack_int info; info = LAPACKE_cgetrf(LAPACK_COL_MAJOR, N, N, _Rin, N, ipiv); + // If the info > 0, that means the U factor is exactly singular, and the det is 0. cytnx_error_msg( - info != 0, "%s %d", + info < 0, "%s %d", "[ERROR][Det_internal] Error in Lapack function 'cgetrf': Lapack INFO = ", info); // Whether lapack_complex_T is defined as std::complex (c++ complex) or T _Complex @@ -87,6 +91,8 @@ namespace cytnx { free(ipiv); free(_Rin); if (neg) od[0] *= -1; + + if (info > 0) od[0] = 0; } void Det_internal_d(void *out, const boost::intrusive_ptr &Rin, const cytnx_uint64 &L) { @@ -98,8 +104,9 @@ namespace cytnx { lapack_int N = L; lapack_int info; info = LAPACKE_dgetrf(LAPACK_COL_MAJOR, N, N, _Rin, N, ipiv); + // If the info > 0, that means the U factor is exactly singular, and the det is 0. cytnx_error_msg( - info != 0, "%s %d", + info < 0, "%s %d", "[ERROR][Det_internal] Error in Lapack function 'dgetrf': Lapack INFO = ", info); od[0] = 1; bool neg = 0; @@ -110,6 +117,8 @@ namespace cytnx { free(ipiv); free(_Rin); if (neg) od[0] *= -1; + + if (info > 0) od[0] = 0; } void Det_internal_f(void *out, const boost::intrusive_ptr &Rin, const cytnx_uint64 &L) { @@ -121,8 +130,9 @@ namespace cytnx { lapack_int N = L; lapack_int info; info = LAPACKE_sgetrf(LAPACK_COL_MAJOR, N, N, _Rin, N, ipiv); + // If the info > 0, that means the U factor is exactly singular, and the det is 0. 
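        // (info < 0 would mean an invalid argument was passed to getrf, which is still treated
        //  as a hard error below.)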
cytnx_error_msg( - info != 0, "%s %d", + info < 0, "%s %d", "[ERROR][Det_internal] Error in Lapack function 'sgetrf': Lapack INFO = ", info); od[0] = 1; bool neg = 0; @@ -133,6 +143,8 @@ namespace cytnx { free(ipiv); free(_Rin); if (neg) od[0] *= -1; + + if (info > 0) od[0] = 0; } } // namespace linalg_internal diff --git a/src/backend/linalg_internal_gpu/cuDet_internal.cu b/src/backend/linalg_internal_gpu/cuDet_internal.cu index 7cf02265..5d8a5c81 100644 --- a/src/backend/linalg_internal_gpu/cuDet_internal.cu +++ b/src/backend/linalg_internal_gpu/cuDet_internal.cu @@ -34,7 +34,8 @@ namespace cytnx { int info; checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost)); - cytnx_error_msg(info != 0, "[ERROR] cusolverDnZgetrf fail with info= %d\n", info); + // If the info > 0, that means the U factor is exactly singular, and the det is 0. + cytnx_error_msg(info < 0, "[ERROR] cusolverDnZgetrf fail with info= %d\n", info); // since we do unify mem, direct access element is possible: od[0] = 1; @@ -52,6 +53,8 @@ namespace cytnx { cudaFree(_in); cusolverDnDestroy(cusolverH); if (neg) od[0] *= -1; + + if (info > 0) od[0] = 0; } void cuDet_internal_cf(void* out, const boost::intrusive_ptr& in, @@ -79,7 +82,8 @@ namespace cytnx { int info; checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost)); - cytnx_error_msg(info != 0, "[ERROR] cusolverDnCgetrf fail with info= %d\n", info); + // If the info > 0, that means the U factor is exactly singular, and the det is 0. + cytnx_error_msg(info < 0, "[ERROR] cusolverDnCgetrf fail with info= %d\n", info); // since we do unify mem, direct access element is possible: od[0] = 1; @@ -97,6 +101,8 @@ namespace cytnx { cudaFree(_in); cusolverDnDestroy(cusolverH); if (neg) od[0] *= -1; + + if (info > 0) od[0] = 0; } void cuDet_internal_d(void* out, const boost::intrusive_ptr& in, @@ -124,7 +130,8 @@ namespace cytnx { int info; checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost)); - cytnx_error_msg(info != 0, "[ERROR] cusolverDnDgetrf fail with info= %d\n", info); + // If the info > 0, that means the U factor is exactly singular, and the det is 0. + cytnx_error_msg(info < 0, "[ERROR] cusolverDnDgetrf fail with info= %d\n", info); // since we do unify mem, direct access element is possible: od[0] = 1; @@ -142,6 +149,8 @@ namespace cytnx { cudaFree(_in); cusolverDnDestroy(cusolverH); if (neg) od[0] *= -1; + + if (info > 0) od[0] = 0; } void cuDet_internal_f(void* out, const boost::intrusive_ptr& in, @@ -169,7 +178,8 @@ namespace cytnx { int info; checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost)); - cytnx_error_msg(info != 0, "[ERROR] cusolverDnSgetrf fail with info= %d\n", info); + // If the info > 0, that means the U factor is exactly singular, and the det is 0. + cytnx_error_msg(info < 0, "[ERROR] cusolverDnSgetrf fail with info= %d\n", info); // since we do unify mem, direct access element is possible: od[0] = 1; @@ -187,6 +197,8 @@ namespace cytnx { cudaFree(_in); cusolverDnDestroy(cusolverH); if (neg) od[0] *= -1; + + if (info > 0) od[0] = 0; } } // namespace linalg_internal diff --git a/src/linalg/Det.cpp b/src/linalg/Det.cpp index 938d95e6..a0566699 100644 --- a/src/linalg/Det.cpp +++ b/src/linalg/Det.cpp @@ -22,9 +22,9 @@ namespace cytnx { if (Tl.dtype() > 4) { // do conversion: _tl = _tl.astype(Type.Double); - out.Init({1}, Type.Double, Device.cpu); // scalar, so on cpu always! 
+ out.Init({1}, Type.Double, _tl.device()); } else { - out.Init({1}, _tl.dtype(), Device.cpu); // scalar, so on cpu always! + out.Init({1}, _tl.dtype(), _tl.device()); } if (Tl.device() == Device.cpu) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1c1b1a40..b665d93c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -30,6 +30,7 @@ add_executable( linalg_test/Svd_test.cpp linalg_test/GeSvd_test.cpp linalg_test/linalg_test.cpp + linalg_test/Det_test.cpp algo_test/Hsplit_test.cpp algo_test/Hstack_test.cpp algo_test/Vsplit_test.cpp diff --git a/tests/gpu/CMakeLists.txt b/tests/gpu/CMakeLists.txt index a76fdc6a..116718f1 100644 --- a/tests/gpu/CMakeLists.txt +++ b/tests/gpu/CMakeLists.txt @@ -22,6 +22,7 @@ add_executable( linalg_test/Svd_test.cpp linalg_test/GeSvd_test.cpp linalg_test/linalg_test.cpp + linalg_test/Det_test.cpp algo_test/Hsplit_test.cpp algo_test/Hstack_test.cpp algo_test/Vsplit_test.cpp diff --git a/tests/gpu/linalg_test/Det_test.cpp b/tests/gpu/linalg_test/Det_test.cpp new file mode 100644 index 00000000..d941a935 --- /dev/null +++ b/tests/gpu/linalg_test/Det_test.cpp @@ -0,0 +1,161 @@ +#include "cytnx.hpp" +#include +#include "../test_tools.h" + +using namespace cytnx; +using namespace testing; +using namespace TestTools; + +namespace DetTest { + + static cytnx_uint64 rand_seed1, rand_seed2; + + void ExcuteDetTest(const Tensor& T); + + void ErrorTestExcute(const Tensor& T); + + /*=====test info===== + describe:Test all possible data type and check the results. + input: + T:Tensor with shape {6, 6} or {2, 2}, {1, 1}, {3, 3}, {4, 4} and test for all + possilbe data type. + ====================*/ + TEST(Det, allDType) { + for (auto device : device_list) { + for (auto dtype1 : dtype_list) { + for (auto dtype2 : dtype_list) { + // The GPU version of Det is just too slow. + // So we lower the size of the tensor from 6,6 to 3,3. + Tensor T = Tensor({3, 3}, dtype1, device); + InitTensorUniform(T, rand_seed1 = 3); + ExcuteDetTest(T); + + T = Tensor({2, 2}, dtype1, device); + InitTensorUniform(T, rand_seed1 = 3); + ExcuteDetTest(T); + + T = Tensor({1, 1}, dtype1, device); + InitTensorUniform(T, rand_seed1 = 3); + ExcuteDetTest(T); + + T = Tensor({3, 3}, dtype1, device); + InitTensorUniform(T, rand_seed1 = 3); + ExcuteDetTest(T); + + T = Tensor({4, 4}, dtype1, device); + InitTensorUniform(T, rand_seed1 = 3); + ExcuteDetTest(T); + } + } + } + } + + /*=====test info===== + describe:Test the tensor only 1 have one element. Test for all possible data type. + input: + T:Tensor with shape {1} on cpu, testing for all possible data type. + ====================*/ + TEST(Det, one_elem_tens) { + for (auto dtype : dtype_list) { + Tensor T = Tensor({1, 1}, dtype); + InitTensorUniform(T, rand_seed1 = 3); + ExcuteDetTest(T); + } + } + + /*=====test info===== + describe:Test for not contiguous tensor. + input: + T:Double data type not contiguous tensor with shape {9,9} on cpu. + ====================*/ + TEST(Det, not_contiguous) { + Tensor T = Tensor({9, 9}, Type.Double); + InitTensorUniform(T, rand_seed1 = 1); + // permute then they will not contiguous + T.permute_({1, 0}); // shape:[9,9] + ExcuteDetTest(T); + } + + // error test + /*=====test info===== + describe:Test the input tensors are both void tensor. + input: + T:void tensor + ====================*/ + TEST(Det, err_void_tens) { + Tensor T = Tensor(); + ErrorTestExcute(T); + } + + /*=====test info===== + describe:Test contains shared axis of the tensors are not same. 
+ input: + T:double type tensor with shape {2, 3} on cpu. + ====================*/ + TEST(Det, err_axis_dim_wrong) { + Tensor T = Tensor({2, 3}); + InitTensorUniform(T, rand_seed1 = 0); + ErrorTestExcute(T); + } + + Tensor ConstructExpectTens(const Tensor& T) { + Tensor dst_T = zeros(1, T.dtype(), T.device()); + int n = T.shape()[0]; + cytnx_error_msg( + T.shape().size() != 2, + "[ERROR] [Det_test] [ConstructExpectTens]The input tensor should be rank-2 tensor.%s", "\n"); + for (auto s : T.shape()) { + cytnx_error_msg( + s != n, + "[ERROR] [Det_test] [ConstructExpectTens]The input tensor should be square matrix.%s", + "\n"); + } + if (n == 1) { + dst_T.at({0}) = T.at({0, 0}); + return dst_T; + } else if (n == 2) { + dst_T.at({0}) = T.at({0, 0}) * T.at({1, 1}) - T.at({0, 1}) * T.at({1, 0}); + return dst_T; + } + for (cytnx_uint64 a = 0; a < n; a++) { + Tensor T2 = zeros({T.shape()[0] - 1, T.shape()[1] - 1}, T.dtype(), T.device()); + for (cytnx_uint64 i = 0; i < n - 1; i++) { + for (cytnx_uint64 j = 0; j < n - 1; j++) { + cytnx_uint64 ii = i + (i >= 0); + cytnx_uint64 jj = j + (j >= a); + T2.at({i, j}) = T.at({ii, jj}); + } + } + dst_T({0}) += (a % 2 == 0 ? 1 : -1) * T.at({0, a}) * ConstructExpectTens(T2); + } + return dst_T; + } + + void ExcuteDetTest(const Tensor& T) { + Tensor det_T = linalg::Det(T); + Tensor expect_T; + expect_T = + ConstructExpectTens(T.dtype() > 4 ? T.astype(Type.Double).to(-1) : T.to(-1)).to(T.device()); + // std::cout << det_T << expect_T << std::endl; + const double tolerance = + std::pow(10.0, std::max((int)std::abs(std::log10(det_T(0).item())) - 6, 0)); + // Because the determinant will be casted to at least double, so we just cast the result to + // ComplexDouble for comparison. + std::cout << "Type: " << Type.getname(T.dtype()) << std::endl; + EXPECT_TRUE(AreNearlyEqTensor(det_T.astype(Type.ComplexDouble), + expect_T.astype(Type.ComplexDouble), tolerance)); + } + + void ErrorTestExcute(const Tensor& T) { + try { + auto dirsum_T = linalg::Det(T); + std::cerr << "[Test Error] This test should throw error but not !" << std::endl; + FAIL(); + } catch (const std::exception& ex) { + auto err_msg = ex.what(); + std::cerr << err_msg << std::endl; + SUCCEED(); + } + } + +} // namespace DetTest diff --git a/tests/gpu/linalg_test/Directsum_test.cpp b/tests/gpu/linalg_test/Directsum_test.cpp index a5f65e47..947961da 100644 --- a/tests/gpu/linalg_test/Directsum_test.cpp +++ b/tests/gpu/linalg_test/Directsum_test.cpp @@ -24,7 +24,7 @@ namespace DirectsumTest { axes:{1} ====================*/ TEST(Directsum, gpu_allDType) { - for (auto device : device_list) { // now only test for cpu device. 
+ for (auto device : device_list) { for (auto dtype1 : dtype_list) { for (auto dtype2 : dtype_list) { Tensor T1 = Tensor({12, 5, 7}, dtype1, device); diff --git a/tests/gpu/test_tools.h b/tests/gpu/test_tools.h index 81edfa3c..d4871289 100644 --- a/tests/gpu/test_tools.h +++ b/tests/gpu/test_tools.h @@ -35,8 +35,8 @@ namespace TestTools { Type.Int32, Type.Uint32, Type.Int16, Type.Uint16, Type.Bool}; static std::vector device_list = { - Device.cpu, - // Device.cuda, //currently cuda version still not implement + // Device.cpu, + Device.cuda, }; // Tensor tools diff --git a/tests/linalg_test/Det_test.cpp b/tests/linalg_test/Det_test.cpp new file mode 100644 index 00000000..f40d168c --- /dev/null +++ b/tests/linalg_test/Det_test.cpp @@ -0,0 +1,158 @@ +#include "cytnx.hpp" +#include +#include "../test_tools.h" + +using namespace cytnx; +using namespace testing; +using namespace TestTools; + +namespace DetTest { + + static cytnx_uint64 rand_seed1, rand_seed2; + + void ExcuteDetTest(const Tensor& T); + + void ErrorTestExcute(const Tensor& T); + + /*=====test info===== + describe:Test all possible data type and check the results. + input: + T:Tensor with shape {6, 6} or {2, 2}, {1, 1}, {3, 3}, {4, 4} and test for all + possilbe data type. + ====================*/ + TEST(Det, allDType) { + for (auto device : device_list) { // now only test for cpu device. + for (auto dtype1 : dtype_list) { + for (auto dtype2 : dtype_list) { + Tensor T = Tensor({6, 6}, dtype1, device); + InitTensorUniform(T, rand_seed1 = 3); + ExcuteDetTest(T); + + T = Tensor({2, 2}, dtype1, device); + InitTensorUniform(T, rand_seed1 = 3); + ExcuteDetTest(T); + + T = Tensor({1, 1}, dtype1, device); + InitTensorUniform(T, rand_seed1 = 3); + ExcuteDetTest(T); + + T = Tensor({3, 3}, dtype1, device); + InitTensorUniform(T, rand_seed1 = 3); + ExcuteDetTest(T); + + T = Tensor({4, 4}, dtype1, device); + InitTensorUniform(T, rand_seed1 = 3); + ExcuteDetTest(T); + } + } + } + } + + /*=====test info===== + describe:Test the tensor only 1 have one element. Test for all possible data type. + input: + T:Tensor with shape {1} on cpu, testing for all possible data type. + ====================*/ + TEST(Det, one_elem_tens) { + for (auto dtype : dtype_list) { + Tensor T = Tensor({1, 1}, dtype); + InitTensorUniform(T, rand_seed1 = 3); + ExcuteDetTest(T); + } + } + + /*=====test info===== + describe:Test for not contiguous tensor. + input: + T:Double data type not contiguous tensor with shape {9,9} on cpu. + ====================*/ + TEST(Det, not_contiguous) { + Tensor T = Tensor({9, 9}, Type.Double); + InitTensorUniform(T, rand_seed1 = 1); + // permute then they will not contiguous + T.permute_({1, 0}); // shape:[9,9] + ExcuteDetTest(T); + } + + // error test + /*=====test info===== + describe:Test the input tensors are both void tensor. + input: + T:void tensor + ====================*/ + TEST(Det, err_void_tens) { + Tensor T = Tensor(); + ErrorTestExcute(T); + } + + /*=====test info===== + describe:Test contains shared axis of the tensors are not same. + input: + T:double type tensor with shape {2, 3} on cpu. 
+ ====================*/ + TEST(Det, err_axis_dim_wrong) { + Tensor T = Tensor({2, 3}); + InitTensorUniform(T, rand_seed1 = 0); + ErrorTestExcute(T); + } + + Tensor ConstructExpectTens(const Tensor& T) { + Tensor dst_T = zeros(1, T.dtype(), T.device()); + int n = T.shape()[0]; + cytnx_error_msg( + T.shape().size() != 2, + "[ERROR] [Det_test] [ConstructExpectTens]The input tensor should be rank-2 tensor.%s", "\n"); + for (auto s : T.shape()) { + cytnx_error_msg( + s != n, + "[ERROR] [Det_test] [ConstructExpectTens]The input tensor should be square matrix.%s", + "\n"); + } + if (n == 1) { + dst_T.at({0}) = T.at({0, 0}); + return dst_T; + } else if (n == 2) { + dst_T.at({0}) = T.at({0, 0}) * T.at({1, 1}) - T.at({0, 1}) * T.at({1, 0}); + return dst_T; + } + for (cytnx_uint64 a = 0; a < n; a++) { + Tensor T2 = zeros({T.shape()[0] - 1, T.shape()[1] - 1}, T.dtype(), T.device()); + for (cytnx_uint64 i = 0; i < n - 1; i++) { + for (cytnx_uint64 j = 0; j < n - 1; j++) { + cytnx_uint64 ii = i + (i >= 0); + cytnx_uint64 jj = j + (j >= a); + T2.at({i, j}) = T.at({ii, jj}); + } + } + dst_T({0}) += (a % 2 == 0 ? 1 : -1) * T.at({0, a}) * ConstructExpectTens(T2); + } + return dst_T; + } + + void ExcuteDetTest(const Tensor& T) { + Tensor det_T = linalg::Det(T); + Tensor expect_T; + expect_T = ConstructExpectTens(T.dtype() > 4 ? T.astype(Type.Double) : T); + // std::cout << det_T << expect_T << std::endl; + const double tolerance = + std::pow(10.0, std::max((int)std::abs(std::log10(det_T(0).item())) - 6, 0)); + // Because the determinant will be casted to at least double, so we just cast the result to + // ComplexDouble for comparison. + std::cout << "Type: " << Type.getname(T.dtype()) << std::endl; + EXPECT_TRUE(AreNearlyEqTensor(det_T.astype(Type.ComplexDouble), + expect_T.astype(Type.ComplexDouble), tolerance)); + } + + void ErrorTestExcute(const Tensor& T) { + try { + auto dirsum_T = linalg::Det(T); + std::cerr << "[Test Error] This test should throw error but not !" << std::endl; + FAIL(); + } catch (const std::exception& ex) { + auto err_msg = ex.what(); + std::cerr << err_msg << std::endl; + SUCCEED(); + } + } + +} // namespace DetTest From cd22a1c5a3326dd2a7315326e7527ac7855563dd Mon Sep 17 00:00:00 2001 From: jeffry1829 Date: Wed, 2 Oct 2024 19:05:22 +0800 Subject: [PATCH 6/6] Revert "fix (cu)Det error when U is singular &" This reverts commit 35c95aa2296d8a37afb56dab50dddbb38d04070d. --- .../linalg_internal_cpu/Det_internal.cpp | 20 +-- .../linalg_internal_gpu/cuDet_internal.cu | 20 +-- src/linalg/Det.cpp | 4 +- tests/CMakeLists.txt | 1 - tests/gpu/CMakeLists.txt | 1 - tests/gpu/linalg_test/Det_test.cpp | 161 ------------------ tests/gpu/linalg_test/Directsum_test.cpp | 2 +- tests/gpu/test_tools.h | 4 +- tests/linalg_test/Det_test.cpp | 158 ----------------- 9 files changed, 13 insertions(+), 358 deletions(-) delete mode 100644 tests/gpu/linalg_test/Det_test.cpp delete mode 100644 tests/linalg_test/Det_test.cpp diff --git a/src/backend/linalg_internal_cpu/Det_internal.cpp b/src/backend/linalg_internal_cpu/Det_internal.cpp index c0bdfd14..c507059e 100644 --- a/src/backend/linalg_internal_cpu/Det_internal.cpp +++ b/src/backend/linalg_internal_cpu/Det_internal.cpp @@ -27,9 +27,8 @@ namespace cytnx { lapack_int N = L; lapack_int info; info = LAPACKE_zgetrf(LAPACK_COL_MAJOR, N, N, _Rin, N, ipiv); - // If the info > 0, that means the U factor is exactly singular, and the det is 0. 
cytnx_error_msg( - info < 0, "%s %d", + info != 0, "%s %d", "[ERROR][Det_internal] Error in Lapack function 'zgetrf': Lapack INFO = ", info); // Whether lapack_complex_T is defined as std::complex (c++ complex) or T _Complex @@ -52,8 +51,6 @@ namespace cytnx { free(ipiv); free(_Rin); if (neg) od[0] *= -1; - - if (info > 0) od[0] = 0; } void Det_internal_cf(void *out, const boost::intrusive_ptr &Rin, const cytnx_uint64 &L) { @@ -66,9 +63,8 @@ namespace cytnx { lapack_int N = L; lapack_int info; info = LAPACKE_cgetrf(LAPACK_COL_MAJOR, N, N, _Rin, N, ipiv); - // If the info > 0, that means the U factor is exactly singular, and the det is 0. cytnx_error_msg( - info < 0, "%s %d", + info != 0, "%s %d", "[ERROR][Det_internal] Error in Lapack function 'cgetrf': Lapack INFO = ", info); // Whether lapack_complex_T is defined as std::complex (c++ complex) or T _Complex @@ -91,8 +87,6 @@ namespace cytnx { free(ipiv); free(_Rin); if (neg) od[0] *= -1; - - if (info > 0) od[0] = 0; } void Det_internal_d(void *out, const boost::intrusive_ptr &Rin, const cytnx_uint64 &L) { @@ -104,9 +98,8 @@ namespace cytnx { lapack_int N = L; lapack_int info; info = LAPACKE_dgetrf(LAPACK_COL_MAJOR, N, N, _Rin, N, ipiv); - // If the info > 0, that means the U factor is exactly singular, and the det is 0. cytnx_error_msg( - info < 0, "%s %d", + info != 0, "%s %d", "[ERROR][Det_internal] Error in Lapack function 'dgetrf': Lapack INFO = ", info); od[0] = 1; bool neg = 0; @@ -117,8 +110,6 @@ namespace cytnx { free(ipiv); free(_Rin); if (neg) od[0] *= -1; - - if (info > 0) od[0] = 0; } void Det_internal_f(void *out, const boost::intrusive_ptr &Rin, const cytnx_uint64 &L) { @@ -130,9 +121,8 @@ namespace cytnx { lapack_int N = L; lapack_int info; info = LAPACKE_sgetrf(LAPACK_COL_MAJOR, N, N, _Rin, N, ipiv); - // If the info > 0, that means the U factor is exactly singular, and the det is 0. cytnx_error_msg( - info < 0, "%s %d", + info != 0, "%s %d", "[ERROR][Det_internal] Error in Lapack function 'sgetrf': Lapack INFO = ", info); od[0] = 1; bool neg = 0; @@ -143,8 +133,6 @@ namespace cytnx { free(ipiv); free(_Rin); if (neg) od[0] *= -1; - - if (info > 0) od[0] = 0; } } // namespace linalg_internal diff --git a/src/backend/linalg_internal_gpu/cuDet_internal.cu b/src/backend/linalg_internal_gpu/cuDet_internal.cu index 5d8a5c81..7cf02265 100644 --- a/src/backend/linalg_internal_gpu/cuDet_internal.cu +++ b/src/backend/linalg_internal_gpu/cuDet_internal.cu @@ -34,8 +34,7 @@ namespace cytnx { int info; checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost)); - // If the info > 0, that means the U factor is exactly singular, and the det is 0. - cytnx_error_msg(info < 0, "[ERROR] cusolverDnZgetrf fail with info= %d\n", info); + cytnx_error_msg(info != 0, "[ERROR] cusolverDnZgetrf fail with info= %d\n", info); // since we do unify mem, direct access element is possible: od[0] = 1; @@ -53,8 +52,6 @@ namespace cytnx { cudaFree(_in); cusolverDnDestroy(cusolverH); if (neg) od[0] *= -1; - - if (info > 0) od[0] = 0; } void cuDet_internal_cf(void* out, const boost::intrusive_ptr& in, @@ -82,8 +79,7 @@ namespace cytnx { int info; checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost)); - // If the info > 0, that means the U factor is exactly singular, and the det is 0. 
- cytnx_error_msg(info < 0, "[ERROR] cusolverDnCgetrf fail with info= %d\n", info); + cytnx_error_msg(info != 0, "[ERROR] cusolverDnCgetrf fail with info= %d\n", info); // since we do unify mem, direct access element is possible: od[0] = 1; @@ -101,8 +97,6 @@ namespace cytnx { cudaFree(_in); cusolverDnDestroy(cusolverH); if (neg) od[0] *= -1; - - if (info > 0) od[0] = 0; } void cuDet_internal_d(void* out, const boost::intrusive_ptr& in, @@ -130,8 +124,7 @@ namespace cytnx { int info; checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost)); - // If the info > 0, that means the U factor is exactly singular, and the det is 0. - cytnx_error_msg(info < 0, "[ERROR] cusolverDnDgetrf fail with info= %d\n", info); + cytnx_error_msg(info != 0, "[ERROR] cusolverDnDgetrf fail with info= %d\n", info); // since we do unify mem, direct access element is possible: od[0] = 1; @@ -149,8 +142,6 @@ namespace cytnx { cudaFree(_in); cusolverDnDestroy(cusolverH); if (neg) od[0] *= -1; - - if (info > 0) od[0] = 0; } void cuDet_internal_f(void* out, const boost::intrusive_ptr& in, @@ -178,8 +169,7 @@ namespace cytnx { int info; checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost)); - // If the info > 0, that means the U factor is exactly singular, and the det is 0. - cytnx_error_msg(info < 0, "[ERROR] cusolverDnSgetrf fail with info= %d\n", info); + cytnx_error_msg(info != 0, "[ERROR] cusolverDnSgetrf fail with info= %d\n", info); // since we do unify mem, direct access element is possible: od[0] = 1; @@ -197,8 +187,6 @@ namespace cytnx { cudaFree(_in); cusolverDnDestroy(cusolverH); if (neg) od[0] *= -1; - - if (info > 0) od[0] = 0; } } // namespace linalg_internal diff --git a/src/linalg/Det.cpp b/src/linalg/Det.cpp index a0566699..938d95e6 100644 --- a/src/linalg/Det.cpp +++ b/src/linalg/Det.cpp @@ -22,9 +22,9 @@ namespace cytnx { if (Tl.dtype() > 4) { // do conversion: _tl = _tl.astype(Type.Double); - out.Init({1}, Type.Double, _tl.device()); + out.Init({1}, Type.Double, Device.cpu); // scalar, so on cpu always! } else { - out.Init({1}, _tl.dtype(), _tl.device()); + out.Init({1}, _tl.dtype(), Device.cpu); // scalar, so on cpu always! 
} if (Tl.device() == Device.cpu) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b665d93c..1c1b1a40 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -30,7 +30,6 @@ add_executable( linalg_test/Svd_test.cpp linalg_test/GeSvd_test.cpp linalg_test/linalg_test.cpp - linalg_test/Det_test.cpp algo_test/Hsplit_test.cpp algo_test/Hstack_test.cpp algo_test/Vsplit_test.cpp diff --git a/tests/gpu/CMakeLists.txt b/tests/gpu/CMakeLists.txt index 116718f1..a76fdc6a 100644 --- a/tests/gpu/CMakeLists.txt +++ b/tests/gpu/CMakeLists.txt @@ -22,7 +22,6 @@ add_executable( linalg_test/Svd_test.cpp linalg_test/GeSvd_test.cpp linalg_test/linalg_test.cpp - linalg_test/Det_test.cpp algo_test/Hsplit_test.cpp algo_test/Hstack_test.cpp algo_test/Vsplit_test.cpp diff --git a/tests/gpu/linalg_test/Det_test.cpp b/tests/gpu/linalg_test/Det_test.cpp deleted file mode 100644 index d941a935..00000000 --- a/tests/gpu/linalg_test/Det_test.cpp +++ /dev/null @@ -1,161 +0,0 @@ -#include "cytnx.hpp" -#include -#include "../test_tools.h" - -using namespace cytnx; -using namespace testing; -using namespace TestTools; - -namespace DetTest { - - static cytnx_uint64 rand_seed1, rand_seed2; - - void ExcuteDetTest(const Tensor& T); - - void ErrorTestExcute(const Tensor& T); - - /*=====test info===== - describe:Test all possible data type and check the results. - input: - T:Tensor with shape {6, 6} or {2, 2}, {1, 1}, {3, 3}, {4, 4} and test for all - possilbe data type. - ====================*/ - TEST(Det, allDType) { - for (auto device : device_list) { - for (auto dtype1 : dtype_list) { - for (auto dtype2 : dtype_list) { - // The GPU version of Det is just too slow. - // So we lower the size of the tensor from 6,6 to 3,3. - Tensor T = Tensor({3, 3}, dtype1, device); - InitTensorUniform(T, rand_seed1 = 3); - ExcuteDetTest(T); - - T = Tensor({2, 2}, dtype1, device); - InitTensorUniform(T, rand_seed1 = 3); - ExcuteDetTest(T); - - T = Tensor({1, 1}, dtype1, device); - InitTensorUniform(T, rand_seed1 = 3); - ExcuteDetTest(T); - - T = Tensor({3, 3}, dtype1, device); - InitTensorUniform(T, rand_seed1 = 3); - ExcuteDetTest(T); - - T = Tensor({4, 4}, dtype1, device); - InitTensorUniform(T, rand_seed1 = 3); - ExcuteDetTest(T); - } - } - } - } - - /*=====test info===== - describe:Test the tensor only 1 have one element. Test for all possible data type. - input: - T:Tensor with shape {1} on cpu, testing for all possible data type. - ====================*/ - TEST(Det, one_elem_tens) { - for (auto dtype : dtype_list) { - Tensor T = Tensor({1, 1}, dtype); - InitTensorUniform(T, rand_seed1 = 3); - ExcuteDetTest(T); - } - } - - /*=====test info===== - describe:Test for not contiguous tensor. - input: - T:Double data type not contiguous tensor with shape {9,9} on cpu. - ====================*/ - TEST(Det, not_contiguous) { - Tensor T = Tensor({9, 9}, Type.Double); - InitTensorUniform(T, rand_seed1 = 1); - // permute then they will not contiguous - T.permute_({1, 0}); // shape:[9,9] - ExcuteDetTest(T); - } - - // error test - /*=====test info===== - describe:Test the input tensors are both void tensor. - input: - T:void tensor - ====================*/ - TEST(Det, err_void_tens) { - Tensor T = Tensor(); - ErrorTestExcute(T); - } - - /*=====test info===== - describe:Test contains shared axis of the tensors are not same. - input: - T:double type tensor with shape {2, 3} on cpu. 
- ====================*/ - TEST(Det, err_axis_dim_wrong) { - Tensor T = Tensor({2, 3}); - InitTensorUniform(T, rand_seed1 = 0); - ErrorTestExcute(T); - } - - Tensor ConstructExpectTens(const Tensor& T) { - Tensor dst_T = zeros(1, T.dtype(), T.device()); - int n = T.shape()[0]; - cytnx_error_msg( - T.shape().size() != 2, - "[ERROR] [Det_test] [ConstructExpectTens]The input tensor should be rank-2 tensor.%s", "\n"); - for (auto s : T.shape()) { - cytnx_error_msg( - s != n, - "[ERROR] [Det_test] [ConstructExpectTens]The input tensor should be square matrix.%s", - "\n"); - } - if (n == 1) { - dst_T.at({0}) = T.at({0, 0}); - return dst_T; - } else if (n == 2) { - dst_T.at({0}) = T.at({0, 0}) * T.at({1, 1}) - T.at({0, 1}) * T.at({1, 0}); - return dst_T; - } - for (cytnx_uint64 a = 0; a < n; a++) { - Tensor T2 = zeros({T.shape()[0] - 1, T.shape()[1] - 1}, T.dtype(), T.device()); - for (cytnx_uint64 i = 0; i < n - 1; i++) { - for (cytnx_uint64 j = 0; j < n - 1; j++) { - cytnx_uint64 ii = i + (i >= 0); - cytnx_uint64 jj = j + (j >= a); - T2.at({i, j}) = T.at({ii, jj}); - } - } - dst_T({0}) += (a % 2 == 0 ? 1 : -1) * T.at({0, a}) * ConstructExpectTens(T2); - } - return dst_T; - } - - void ExcuteDetTest(const Tensor& T) { - Tensor det_T = linalg::Det(T); - Tensor expect_T; - expect_T = - ConstructExpectTens(T.dtype() > 4 ? T.astype(Type.Double).to(-1) : T.to(-1)).to(T.device()); - // std::cout << det_T << expect_T << std::endl; - const double tolerance = - std::pow(10.0, std::max((int)std::abs(std::log10(det_T(0).item())) - 6, 0)); - // Because the determinant will be casted to at least double, so we just cast the result to - // ComplexDouble for comparison. - std::cout << "Type: " << Type.getname(T.dtype()) << std::endl; - EXPECT_TRUE(AreNearlyEqTensor(det_T.astype(Type.ComplexDouble), - expect_T.astype(Type.ComplexDouble), tolerance)); - } - - void ErrorTestExcute(const Tensor& T) { - try { - auto dirsum_T = linalg::Det(T); - std::cerr << "[Test Error] This test should throw error but not !" << std::endl; - FAIL(); - } catch (const std::exception& ex) { - auto err_msg = ex.what(); - std::cerr << err_msg << std::endl; - SUCCEED(); - } - } - -} // namespace DetTest diff --git a/tests/gpu/linalg_test/Directsum_test.cpp b/tests/gpu/linalg_test/Directsum_test.cpp index 947961da..a5f65e47 100644 --- a/tests/gpu/linalg_test/Directsum_test.cpp +++ b/tests/gpu/linalg_test/Directsum_test.cpp @@ -24,7 +24,7 @@ namespace DirectsumTest { axes:{1} ====================*/ TEST(Directsum, gpu_allDType) { - for (auto device : device_list) { + for (auto device : device_list) { // now only test for cpu device. 
for (auto dtype1 : dtype_list) { for (auto dtype2 : dtype_list) { Tensor T1 = Tensor({12, 5, 7}, dtype1, device); diff --git a/tests/gpu/test_tools.h b/tests/gpu/test_tools.h index d4871289..81edfa3c 100644 --- a/tests/gpu/test_tools.h +++ b/tests/gpu/test_tools.h @@ -35,8 +35,8 @@ namespace TestTools { Type.Int32, Type.Uint32, Type.Int16, Type.Uint16, Type.Bool}; static std::vector device_list = { - // Device.cpu, - Device.cuda, + Device.cpu, + // Device.cuda, //currently cuda version still not implement }; // Tensor tools diff --git a/tests/linalg_test/Det_test.cpp b/tests/linalg_test/Det_test.cpp deleted file mode 100644 index f40d168c..00000000 --- a/tests/linalg_test/Det_test.cpp +++ /dev/null @@ -1,158 +0,0 @@ -#include "cytnx.hpp" -#include -#include "../test_tools.h" - -using namespace cytnx; -using namespace testing; -using namespace TestTools; - -namespace DetTest { - - static cytnx_uint64 rand_seed1, rand_seed2; - - void ExcuteDetTest(const Tensor& T); - - void ErrorTestExcute(const Tensor& T); - - /*=====test info===== - describe:Test all possible data type and check the results. - input: - T:Tensor with shape {6, 6} or {2, 2}, {1, 1}, {3, 3}, {4, 4} and test for all - possilbe data type. - ====================*/ - TEST(Det, allDType) { - for (auto device : device_list) { // now only test for cpu device. - for (auto dtype1 : dtype_list) { - for (auto dtype2 : dtype_list) { - Tensor T = Tensor({6, 6}, dtype1, device); - InitTensorUniform(T, rand_seed1 = 3); - ExcuteDetTest(T); - - T = Tensor({2, 2}, dtype1, device); - InitTensorUniform(T, rand_seed1 = 3); - ExcuteDetTest(T); - - T = Tensor({1, 1}, dtype1, device); - InitTensorUniform(T, rand_seed1 = 3); - ExcuteDetTest(T); - - T = Tensor({3, 3}, dtype1, device); - InitTensorUniform(T, rand_seed1 = 3); - ExcuteDetTest(T); - - T = Tensor({4, 4}, dtype1, device); - InitTensorUniform(T, rand_seed1 = 3); - ExcuteDetTest(T); - } - } - } - } - - /*=====test info===== - describe:Test the tensor only 1 have one element. Test for all possible data type. - input: - T:Tensor with shape {1} on cpu, testing for all possible data type. - ====================*/ - TEST(Det, one_elem_tens) { - for (auto dtype : dtype_list) { - Tensor T = Tensor({1, 1}, dtype); - InitTensorUniform(T, rand_seed1 = 3); - ExcuteDetTest(T); - } - } - - /*=====test info===== - describe:Test for not contiguous tensor. - input: - T:Double data type not contiguous tensor with shape {9,9} on cpu. - ====================*/ - TEST(Det, not_contiguous) { - Tensor T = Tensor({9, 9}, Type.Double); - InitTensorUniform(T, rand_seed1 = 1); - // permute then they will not contiguous - T.permute_({1, 0}); // shape:[9,9] - ExcuteDetTest(T); - } - - // error test - /*=====test info===== - describe:Test the input tensors are both void tensor. - input: - T:void tensor - ====================*/ - TEST(Det, err_void_tens) { - Tensor T = Tensor(); - ErrorTestExcute(T); - } - - /*=====test info===== - describe:Test contains shared axis of the tensors are not same. - input: - T:double type tensor with shape {2, 3} on cpu. 
- ====================*/ - TEST(Det, err_axis_dim_wrong) { - Tensor T = Tensor({2, 3}); - InitTensorUniform(T, rand_seed1 = 0); - ErrorTestExcute(T); - } - - Tensor ConstructExpectTens(const Tensor& T) { - Tensor dst_T = zeros(1, T.dtype(), T.device()); - int n = T.shape()[0]; - cytnx_error_msg( - T.shape().size() != 2, - "[ERROR] [Det_test] [ConstructExpectTens]The input tensor should be rank-2 tensor.%s", "\n"); - for (auto s : T.shape()) { - cytnx_error_msg( - s != n, - "[ERROR] [Det_test] [ConstructExpectTens]The input tensor should be square matrix.%s", - "\n"); - } - if (n == 1) { - dst_T.at({0}) = T.at({0, 0}); - return dst_T; - } else if (n == 2) { - dst_T.at({0}) = T.at({0, 0}) * T.at({1, 1}) - T.at({0, 1}) * T.at({1, 0}); - return dst_T; - } - for (cytnx_uint64 a = 0; a < n; a++) { - Tensor T2 = zeros({T.shape()[0] - 1, T.shape()[1] - 1}, T.dtype(), T.device()); - for (cytnx_uint64 i = 0; i < n - 1; i++) { - for (cytnx_uint64 j = 0; j < n - 1; j++) { - cytnx_uint64 ii = i + (i >= 0); - cytnx_uint64 jj = j + (j >= a); - T2.at({i, j}) = T.at({ii, jj}); - } - } - dst_T({0}) += (a % 2 == 0 ? 1 : -1) * T.at({0, a}) * ConstructExpectTens(T2); - } - return dst_T; - } - - void ExcuteDetTest(const Tensor& T) { - Tensor det_T = linalg::Det(T); - Tensor expect_T; - expect_T = ConstructExpectTens(T.dtype() > 4 ? T.astype(Type.Double) : T); - // std::cout << det_T << expect_T << std::endl; - const double tolerance = - std::pow(10.0, std::max((int)std::abs(std::log10(det_T(0).item())) - 6, 0)); - // Because the determinant will be casted to at least double, so we just cast the result to - // ComplexDouble for comparison. - std::cout << "Type: " << Type.getname(T.dtype()) << std::endl; - EXPECT_TRUE(AreNearlyEqTensor(det_T.astype(Type.ComplexDouble), - expect_T.astype(Type.ComplexDouble), tolerance)); - } - - void ErrorTestExcute(const Tensor& T) { - try { - auto dirsum_T = linalg::Det(T); - std::cerr << "[Test Error] This test should throw error but not !" << std::endl; - FAIL(); - } catch (const std::exception& ex) { - auto err_msg = ex.what(); - std::cerr << err_msg << std::endl; - SUCCEED(); - } - } - -} // namespace DetTest
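For reference, the determinant extraction shared by the LAPACK and cuSOLVER paths in the patches above (sign from the pivot array, product of the U diagonal) can be checked in isolation. The following is a minimal host-only C++ sketch with a hypothetical helper name det_from_lu, assuming a getrf-style in-place LU with 1-based pivots:

#include <iostream>
#include <vector>

// det(A) from an in-place LU factorization (getrf convention):
//   lu   : L\U stored densely, diagonal of U at lu[i*n + i]
//   ipiv : 1-based pivots; ipiv[i] != i+1 records a row interchange (sign flip)
template <typename T>
T det_from_lu(const std::vector<T>& lu, const std::vector<int>& ipiv, int n) {
  T det = T(1);
  bool neg = false;
  for (int i = 0; i < n; ++i) {
    det *= lu[i * n + i];
    if (ipiv[i] != i + 1) neg = !neg;
  }
  return neg ? -det : det;
}

int main() {
  // LU of [[0,1],[1,0]] with partial pivoting: one row swap, U = I, so det = -1.
  std::vector<double> lu{1.0, 0.0, 0.0, 1.0};
  std::vector<int> ipiv{2, 2};
  std::cout << det_from_lu(lu, ipiv, 2) << "\n";  // prints -1
  return 0;
}

This mirrors the loop in Det_internal.cpp and cuDet_internal.cu; the GPU variants differ only in that the diagonal and pivot array are first copied (or unified-memory mapped) back to the host.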