Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove Magma dependency by using CuSolver Routines #483

Merged
merged 6 commits into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ option(BUILD_DOC "Build API docuemntation" OFF)
option(USE_HPTT "Build Cytnx with HPTT" OFF)
option(RUN_TESTS "Run Cytnx tests" OFF)
option(USE_CUTT "Build Cytnx with CUTT" OFF)
option(USE_MAGMA "Build Cytnx with MAGMA (requires CUDA)" ON)
option(USE_CUTENSOR "Build Cytnx with CuTensor (requires CUDA)" ON)
option(USE_CUQUANTUM "Build Cytnx with CUQuantum (requires CUDA)" ON)

Expand Down
20 changes: 0 additions & 20 deletions CytnxBKNDCMakeLists.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -169,26 +169,6 @@ if(USE_CUDA)

endif()

if(USE_MAGMA)
find_package( MAGMA REQUIRED)
if(NOT MAGMA_FOUND)
message(FATAL_ERROR "MAGMA not found!")
endif()
message(STATUS "^^^magma root aft: ${MAGMA_ROOT}")
message(STATUS "^^^magma inc dr: ${MAGMA_INCLUDE_DIRS}")
message(STATUS "^^^magma lib dr: ${MAGMA_LIBRARY_DIRS}")
message(STATUS "^^^magma libs: ${MAGMA_LIBRARIES}")
#add_dependencies(cytnx magma)
target_include_directories(cytnx PRIVATE ${MAGMA_INCLUDE_DIRS})
target_compile_definitions(cytnx PRIVATE UNI_MAGMA)
target_link_libraries(cytnx PUBLIC ${MAGMA_LIBRARIES})

message( STATUS "Build with MAGMA: YES")
FILE(APPEND "${CMAKE_BINARY_DIR}/cxxflags.tmp" "-DUNI_MAGMA\n" "")
FILE(APPEND "${CMAKE_BINARY_DIR}/cxxflags.tmp" "-I${MAGMA_INCLUDE_DIRS}\n" "")
FILE(APPEND "${CMAKE_BINARY_DIR}/linkflags.tmp" "${MAGMA_LIBRARIES} -ldl\n" "") # use > to indicate special rt processing
message( STATUS "MAGMA: libdir:${MAGMA_LIBRARY_DIRS} incdir:${MAGMA_INCLUDE_DIRS} libs:${MAGMA_LIBRARIES}")
endif()

message( STATUS " Build CUDA Support: YES")
message( STATUS " - CUDA Version: ${CUDA_VERSION_STRING}")
Expand Down
4 changes: 2 additions & 2 deletions cmake/Modules/FindCUQUANTUM.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
# CUQUANTUM_INCLUDE_DIRS ... cuQuantum include directory
# CUQUANTUM_LIBRARIES ... cuQuantum libraries
#
# MAGMA_ROOT this is required to set!
# CUQUANTUM_ROOT this is required to set!
#

#If environment variable MAGMA_ROOT is specified, it has same effect as MAGMA_ROOT
#If environment variable CUQUANTUM_ROOT is specified, it has same effect as CUQUANTUM_ROOT

if(NOT DEFINED ENV{CUQUANTUM_ROOT} AND NOT DEFINED CUQUANTUM_ROOT)
message(FATAL_ERROR "CUQUANTUM_ROOT not set!")
Expand Down
16 changes: 1 addition & 15 deletions src/Device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@
#include <omp.h>
#endif

#ifdef UNI_MAGMA
#include "magma_v2.h"
#endif

using namespace std;
namespace cytnx {

Expand Down Expand Up @@ -43,12 +39,6 @@ namespace cytnx {
}
}

// #ifdef UNI_MAGMA
// int magma_status = magma_init();
// cytnx_error_msg(magma_status!=MAGMA_SUCCESS,"[ERROR] magma system cannot
// initialize!%s","\n");
// #endif

#endif

#ifdef UNI_OMP
Expand All @@ -62,11 +52,7 @@ namespace cytnx {
};

Device_class::~Device_class(){
// #ifdef UNI_MAGMA
// int magma_status = magma_finalize();
// cytnx_error_msg(magma_status!=MAGMA_SUCCESS,"[ERROR] magma system cannot
// finalize!%s","\n");
// #endif

};

string Device_class::getname(const int& device_id) {
Expand Down
180 changes: 108 additions & 72 deletions src/backend/linalg_internal_gpu/cuDet_internal.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,152 +5,188 @@

#include "../utils_internal_gpu/cuAlloc_gpu.hpp"

#ifdef UNI_MAGMA
#include "magma_v2.h"
#endif

namespace cytnx {

namespace linalg_internal {

void cuDet_internal_cd(void* out, const boost::intrusive_ptr<Storage_base>& in,
const cytnx_uint64& L) {
#ifdef UNI_MAGMA
cytnx_complex128* od = (cytnx_complex128*)out; // result on cpu!
cuDoubleComplex* _in = (cuDoubleComplex*)utils_internal::cuMalloc_gpu(
in->len * sizeof(cuDoubleComplex)); // unify mem.
checkCudaErrors(
cudaMemcpy(_in, in->Mem, sizeof(cytnx_complex128) * in->len, cudaMemcpyDeviceToDevice));

magma_int_t* ipiv;
magma_imalloc_cpu(&ipiv, L + 1);
magma_int_t N = L;
magma_int_t info;
magma_zgetrf_gpu(N, N, _in, N, ipiv, &info);
cytnx_error_msg(info != 0, "[ERROR] magma_zgetrf_gpu fail with info= %d\n", info);
cusolverDnHandle_t cusolverH;
cusolverDnCreate(&cusolverH);

int* devIpiv;
int* devInfo;
checkCudaErrors(cudaMalloc((void**)&devIpiv, L * sizeof(int)));
checkCudaErrors(cudaMalloc((void**)&devInfo, sizeof(int)));

int workspace_size = 0;
cuDoubleComplex* workspace = NULL;
cusolverDnZgetrf_bufferSize(cusolverH, L, L, _in, L, &workspace_size);
checkCudaErrors(cudaMalloc((void**)&workspace, workspace_size * sizeof(cuDoubleComplex)));

cusolverDnZgetrf(cusolverH, L, L, _in, L, workspace, devIpiv, devInfo);

int info;
checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
cytnx_error_msg(info != 0, "[ERROR] cusolverDnZgetrf fail with info= %d\n", info);

// since we do unify mem, direct access element is possible:
od[0] = 1;
bool neg = 0;
for (magma_int_t i = 0; i < N; i++) {
od[0] *= ((cytnx_complex128*)_in)[i * N + i];
int* ipiv = new int[L];
checkCudaErrors(cudaMemcpy(ipiv, devIpiv, L * sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < L; i++) {
od[0] *= ((cytnx_complex128*)_in)[i * L + i];
if (ipiv[i] != (i + 1)) neg = !neg;
}
magma_free_cpu(ipiv);
delete[] ipiv;
cudaFree(devIpiv);
cudaFree(devInfo);
cudaFree(workspace);
cudaFree(_in);
cusolverDnDestroy(cusolverH);
if (neg) od[0] *= -1;

#else
cytnx_error_msg(true,
"[ERROR][internal Det] Det for Tensor on GPU require magma. please "
"re-compiling cytnx with magma.%s",
"\n");
#endif
}

void cuDet_internal_cf(void* out, const boost::intrusive_ptr<Storage_base>& in,
const cytnx_uint64& L) {
#ifdef UNI_MAGMA
cytnx_complex64* od = (cytnx_complex64*)out; // result on cpu!
cuFloatComplex* _in = (cuFloatComplex*)utils_internal::cuMalloc_gpu(
in->len * sizeof(cuFloatComplex)); // unify mem.
checkCudaErrors(
cudaMemcpy(_in, in->Mem, sizeof(cytnx_complex64) * in->len, cudaMemcpyDeviceToDevice));

magma_int_t* ipiv;
magma_imalloc_cpu(&ipiv, L + 1);
magma_int_t N = L;
magma_int_t info;
magma_cgetrf_gpu(N, N, _in, N, ipiv, &info);
cytnx_error_msg(info != 0, "[ERROR] magma_cgetrf_gpu fail with info= %d\n", info);
cusolverDnHandle_t cusolverH;
cusolverDnCreate(&cusolverH);

int* devIpiv;
int* devInfo;
checkCudaErrors(cudaMalloc((void**)&devIpiv, L * sizeof(int)));
checkCudaErrors(cudaMalloc((void**)&devInfo, sizeof(int)));

int workspace_size = 0;
cuFloatComplex* workspace = NULL;
cusolverDnCgetrf_bufferSize(cusolverH, L, L, _in, L, &workspace_size);
checkCudaErrors(cudaMalloc((void**)&workspace, workspace_size * sizeof(cuFloatComplex)));

cusolverDnCgetrf(cusolverH, L, L, _in, L, workspace, devIpiv, devInfo);

int info;
checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
cytnx_error_msg(info != 0, "[ERROR] cusolverDnCgetrf fail with info= %d\n", info);

// since we do unify mem, direct access element is possible:
od[0] = 1;
bool neg = 0;
for (magma_int_t i = 0; i < N; i++) {
od[0] *= ((cytnx_complex64*)_in)[i * N + i];
int* ipiv = new int[L];
checkCudaErrors(cudaMemcpy(ipiv, devIpiv, L * sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < L; i++) {
od[0] *= ((cytnx_complex64*)_in)[i * L + i];
if (ipiv[i] != (i + 1)) neg = !neg;
}
magma_free_cpu(ipiv);
delete[] ipiv;
cudaFree(devIpiv);
cudaFree(devInfo);
cudaFree(workspace);
cudaFree(_in);
cusolverDnDestroy(cusolverH);
if (neg) od[0] *= -1;

#else
cytnx_error_msg(true,
"[ERROR][internal Det] Det for Tensor on GPU require magma. please "
"re-compiling cytnx with magma.%s",
"\n");
#endif
}

void cuDet_internal_d(void* out, const boost::intrusive_ptr<Storage_base>& in,
const cytnx_uint64& L) {
#ifdef UNI_MAGMA
cytnx_double* od = (cytnx_double*)out; // result on cpu!
cytnx_double* _in =
(cytnx_double*)utils_internal::cuMalloc_gpu(in->len * sizeof(cytnx_double)); // unify mem.
checkCudaErrors(
cudaMemcpy(_in, in->Mem, sizeof(cytnx_double) * in->len, cudaMemcpyDeviceToDevice));

magma_int_t* ipiv;
magma_imalloc_cpu(&ipiv, L + 1);
magma_int_t N = L;
magma_int_t info;
magma_dgetrf_gpu(N, N, _in, N, ipiv, &info);
cytnx_error_msg(info != 0, "[ERROR] magma_dgetrf_gpu fail with info= %d\n", info);
cusolverDnHandle_t cusolverH;
cusolverDnCreate(&cusolverH);

int* devIpiv;
int* devInfo;
checkCudaErrors(cudaMalloc((void**)&devIpiv, L * sizeof(int)));
checkCudaErrors(cudaMalloc((void**)&devInfo, sizeof(int)));

int workspace_size = 0;
cytnx_double* workspace = NULL;
cusolverDnDgetrf_bufferSize(cusolverH, L, L, _in, L, &workspace_size);
checkCudaErrors(cudaMalloc((void**)&workspace, workspace_size * sizeof(cytnx_double)));

cusolverDnDgetrf(cusolverH, L, L, _in, L, workspace, devIpiv, devInfo);

int info;
checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
cytnx_error_msg(info != 0, "[ERROR] cusolverDnDgetrf fail with info= %d\n", info);

// since we do unify mem, direct access element is possible:
od[0] = 1;
bool neg = 0;
for (magma_int_t i = 0; i < N; i++) {
od[0] *= _in[i * N + i];
int* ipiv = new int[L];
checkCudaErrors(cudaMemcpy(ipiv, devIpiv, L * sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < L; i++) {
od[0] *= _in[i * L + i];
if (ipiv[i] != (i + 1)) neg = !neg;
}
magma_free_cpu(ipiv);
delete[] ipiv;
cudaFree(devIpiv);
cudaFree(devInfo);
cudaFree(workspace);
cudaFree(_in);
cusolverDnDestroy(cusolverH);
if (neg) od[0] *= -1;

#else
cytnx_error_msg(true,
"[ERROR][internal Det] Det for Tensor on GPU require magma. please "
"re-compiling cytnx with magma.%s",
"\n");
#endif
}

void cuDet_internal_f(void* out, const boost::intrusive_ptr<Storage_base>& in,
const cytnx_uint64& L) {
#ifdef UNI_MAGMA
cytnx_float* od = (cytnx_float*)out; // result on cpu!
cytnx_float* _in =
(cytnx_float*)utils_internal::cuMalloc_gpu(in->len * sizeof(cytnx_float)); // unify mem.
checkCudaErrors(
cudaMemcpy(_in, in->Mem, sizeof(cytnx_float) * in->len, cudaMemcpyDeviceToDevice));

magma_int_t* ipiv;
magma_imalloc_cpu(&ipiv, L + 1);
magma_int_t N = L;
magma_int_t info;
magma_sgetrf_gpu(N, N, _in, N, ipiv, &info);
cytnx_error_msg(info != 0, "[ERROR] magma_sgetrf_gpu fail with info= %d\n", info);
cusolverDnHandle_t cusolverH;
cusolverDnCreate(&cusolverH);

int* devIpiv;
int* devInfo;
checkCudaErrors(cudaMalloc((void**)&devIpiv, L * sizeof(int)));
checkCudaErrors(cudaMalloc((void**)&devInfo, sizeof(int)));

int workspace_size = 0;
cytnx_float* workspace = NULL;
cusolverDnSgetrf_bufferSize(cusolverH, L, L, _in, L, &workspace_size);
checkCudaErrors(cudaMalloc((void**)&workspace, workspace_size * sizeof(cytnx_float)));

cusolverDnSgetrf(cusolverH, L, L, _in, L, workspace, devIpiv, devInfo);

int info;
checkCudaErrors(cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
cytnx_error_msg(info != 0, "[ERROR] cusolverDnSgetrf fail with info= %d\n", info);

// since we do unify mem, direct access element is possible:
od[0] = 1;
bool neg = 0;
for (magma_int_t i = 0; i < N; i++) {
od[0] *= _in[i * N + i];
int* ipiv = new int[L];
checkCudaErrors(cudaMemcpy(ipiv, devIpiv, L * sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < L; i++) {
od[0] *= _in[i * L + i];
if (ipiv[i] != (i + 1)) neg = !neg;
}
magma_free_cpu(ipiv);
delete[] ipiv;
cudaFree(devIpiv);
cudaFree(devInfo);
cudaFree(workspace);
cudaFree(_in);
cusolverDnDestroy(cusolverH);
if (neg) od[0] *= -1;

#else
cytnx_error_msg(true,
"[ERROR][internal Det] Det for Tensor on GPU require magma. please "
"re-compiling cytnx with magma.%s",
"\n");
#endif
}

} // namespace linalg_internal
Expand Down
Loading
Loading