From c8dc9a90157a20e75b3b76fafdac0053ba996cf9 Mon Sep 17 00:00:00 2001 From: Romain Biessy <romain.biessy@codeplay.com> Date: Tue, 29 Oct 2024 10:48:44 +0100 Subject: [PATCH 1/2] [SPARSE] Add support for cuSPARSE backend (#527) --- CMakeLists.txt | 8 +- README.md | 20 +- cmake/FindCompiler.cmake | 4 +- docs/building_the_project_with_dpcpp.rst | 8 +- docs/domains/sparse_linear_algebra.rst | 139 ++++- examples/README.md | 124 ++--- .../compile_time_dispatching/CMakeLists.txt | 21 +- ... sparse_blas_spmv_usm_mklcpu_cusparse.cpp} | 188 +++---- .../run_time_dispatching/CMakeLists.txt | 3 + include/oneapi/mkl/detail/backends.hpp | 29 +- include/oneapi/mkl/detail/backends_table.hpp | 6 + include/oneapi/mkl/sparse_blas.hpp | 3 + .../cusparse/onemkl_sparse_blas_cusparse.hpp | 33 ++ .../detail/cusparse/sparse_blas_ct.hpp | 40 ++ include/oneapi/mkl/sparse_blas/types.hpp | 1 + src/CMakeLists.txt | 1 + src/config.hpp.in | 1 + src/sparse_blas/backends/CMakeLists.txt | 4 + .../backends/cusparse/CMakeLists.txt | 85 +++ .../backends/cusparse/cusparse_error.hpp | 103 ++++ .../cusparse/cusparse_global_handle.hpp | 63 +++ .../backends/cusparse/cusparse_handles.cpp | 485 ++++++++++++++++++ .../backends/cusparse/cusparse_handles.hpp | 95 ++++ .../backends/cusparse/cusparse_helper.hpp | 166 ++++++ .../cusparse/cusparse_scope_handle.cpp | 147 ++++++ .../cusparse/cusparse_scope_handle.hpp | 88 ++++ .../backends/cusparse/cusparse_task.hpp | 431 ++++++++++++++++ .../backends/cusparse/cusparse_wrappers.cpp | 32 ++ .../cusparse/operations/cusparse_spmm.cpp | 336 ++++++++++++ .../cusparse/operations/cusparse_spmv.cpp | 335 ++++++++++++ .../cusparse/operations/cusparse_spsv.cpp | 289 +++++++++++ .../backends/mkl_common/mkl_dispatch.hpp | 37 ++ .../backends/mkl_common/mkl_handles.cxx | 217 +++----- .../backends/mkl_common/mkl_handles.hpp | 2 + .../backends/mkl_common/mkl_helper.hpp | 111 ---- .../backends/mkl_common/mkl_spmm.cxx | 122 ++--- .../backends/mkl_common/mkl_spmv.cxx | 119 ++--- 
.../backends/mkl_common/mkl_spsv.cxx | 98 ++-- .../backends/mklcpu/mklcpu_handles.cpp | 2 +- .../backends/mklcpu/mklcpu_operations.cpp | 4 +- .../backends/mklgpu/mklgpu_handles.cpp | 2 +- .../backends/mklgpu/mklgpu_operations.cpp | 4 +- src/sparse_blas/common_op_verification.hpp | 137 +++++ src/sparse_blas/generic_container.hpp | 94 +++- src/sparse_blas/macros.hpp | 81 +++ src/sparse_blas/sycl_helper.hpp | 80 +++ tests/unit_tests/CMakeLists.txt | 5 + tests/unit_tests/include/test_helper.hpp | 10 + tests/unit_tests/main_test.cpp | 3 +- .../sparse_blas/include/test_common.hpp | 93 +++- .../sparse_blas/include/test_spmm.hpp | 138 ++--- .../sparse_blas/include/test_spmv.hpp | 130 ++--- .../sparse_blas/include/test_spsv.hpp | 102 ++-- .../sparse_blas/source/sparse_spmm_buffer.cpp | 23 +- .../sparse_blas/source/sparse_spmm_usm.cpp | 69 ++- .../sparse_blas/source/sparse_spmv_buffer.cpp | 21 +- .../sparse_blas/source/sparse_spmv_usm.cpp | 66 +-- .../sparse_blas/source/sparse_spsv_buffer.cpp | 22 +- .../sparse_blas/source/sparse_spsv_usm.cpp | 65 +-- 59 files changed, 4113 insertions(+), 1032 deletions(-) rename examples/sparse_blas/compile_time_dispatching/{sparse_blas_spmv_usm_mklcpu.cpp => sparse_blas_spmv_usm_mklcpu_cusparse.cpp} (55%) create mode 100644 include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp create mode 100644 include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp create mode 100644 src/sparse_blas/backends/cusparse/CMakeLists.txt create mode 100644 src/sparse_blas/backends/cusparse/cusparse_error.hpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_handles.cpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_handles.hpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_helper.hpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp create mode 100644 
src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_task.hpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp create mode 100644 src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp create mode 100644 src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp create mode 100644 src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp create mode 100644 src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp delete mode 100644 src/sparse_blas/backends/mkl_common/mkl_helper.hpp create mode 100644 src/sparse_blas/common_op_verification.hpp create mode 100644 src/sparse_blas/sycl_helper.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index c4cad8e17..76f5aedc2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,9 @@ option(ENABLE_CUFFT_BACKEND "Enable the cuFFT backend for the DFT interface" OFF option(ENABLE_ROCFFT_BACKEND "Enable the rocFFT backend for the DFT interface" OFF) option(ENABLE_PORTFFT_BACKEND "Enable the portFFT DFT backend for the DFT interface. Cannot be used with other DFT backends." 
OFF) +# sparse +option(ENABLE_CUSPARSE_BACKEND "Enable the cuSPARSE backend for the SPARSE_BLAS interface" OFF) + set(ONEMKL_SYCL_IMPLEMENTATION "dpc++" CACHE STRING "Name of the SYCL compiler") set(HIP_TARGETS "" CACHE STRING "Target HIP architectures") @@ -102,7 +105,8 @@ if(ENABLE_MKLGPU_BACKEND list(APPEND DOMAINS_LIST "dft") endif() if(ENABLE_MKLCPU_BACKEND - OR ENABLE_MKLGPU_BACKEND) + OR ENABLE_MKLGPU_BACKEND + OR ENABLE_CUSPARSE_BACKEND) list(APPEND DOMAINS_LIST "sparse_blas") endif() @@ -129,7 +133,7 @@ if(CMAKE_CXX_COMPILER OR NOT ONEMKL_SYCL_IMPLEMENTATION STREQUAL "dpc++") string(REPLACE "\\" "/" CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}) endif() else() - if(ENABLE_CUBLAS_BACKEND OR ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUFFT_BACKEND + if(ENABLE_CUBLAS_BACKEND OR ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUFFT_BACKEND OR ENABLE_CUSPARSE_BACKEND OR ENABLE_ROCBLAS_BACKEND OR ENABLE_ROCRAND_BACKEND OR ENABLE_ROCSOLVER_BACKEND OR ENABLE_ROCFFT_BACKEND) set(CMAKE_CXX_COMPILER "clang++") elseif(ENABLE_MKLGPU_BACKEND) diff --git a/README.md b/README.md index 5dc8c9c3b..dc023c67c 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,8 @@ oneMKL is part of the [UXL Foundation](http://www.uxlfoundation.org). </thead> <tbody> <tr> - <td rowspan=12 align="center">oneMKL interface</td> - <td rowspan=12 align="center">oneMKL selector</td> + <td rowspan=13 align="center">oneMKL interface</td> + <td rowspan=13 align="center">oneMKL selector</td> <td align="center"><a href="https://software.intel.com/en-us/oneapi/onemkl">Intel(R) oneAPI Math Kernel Library (oneMKL)</a></td> <td align="center">x86 CPU, Intel GPU</td> </tr> @@ -28,10 +28,10 @@ oneMKL is part of the [UXL Foundation](http://www.uxlfoundation.org). 
<td align="center"><a href="https://developer.nvidia.com/cublas"> NVIDIA cuBLAS</a></td> <td align="center">NVIDIA GPU</td> </tr> - <tr> + <tr> <td align="center"><a href="https://developer.nvidia.com/cusolver"> NVIDIA cuSOLVER</a></td> <td align="center">NVIDIA GPU</td> - </tr> + </tr> <tr> <td align="center"><a href="https://developer.nvidia.com/curand"> NVIDIA cuRAND</a></td> <td align="center">NVIDIA GPU</td> @@ -40,6 +40,10 @@ oneMKL is part of the [UXL Foundation](http://www.uxlfoundation.org). <td align="center"><a href="https://developer.nvidia.com/cufft"> NVIDIA cuFFT</a></td> <td align="center">NVIDIA GPU</td> </tr> + <tr> + <td align="center"><a href="https://developer.nvidia.com/cusparse"> NVIDIA cuSPARSE</a></td> + <td align="center">NVIDIA GPU</td> + </tr> <tr> <td align="center"><a href="https://ww.netlib.org"> NETLIB LAPACK</a> </td> <td align="center">x86 CPU</td> @@ -329,7 +333,7 @@ Supported compilers include: <td align="center">Dynamic, Static</td> </tr> <tr> - <td rowspan=2 align="center">SPARSE_BLAS</td> + <td rowspan=3 align="center">SPARSE_BLAS</td> <td align="center">x86 CPU</td> <td align="center">Intel(R) oneMKL</td> <td align="center">Intel DPC++</td> @@ -341,6 +345,12 @@ Supported compilers include: <td align="center">Intel DPC++</td> <td align="center">Dynamic, Static</td> </tr> + <tr> + <td align="center">NVIDIA GPU</td> + <td align="center">NVIDIA cuSPARSE</td> + <td align="center">Open DPC++</td> + <td align="center">Dynamic, Static</td> + </tr> </tbody> </table> diff --git a/cmake/FindCompiler.cmake b/cmake/FindCompiler.cmake index 556211999..8aefc2623 100644 --- a/cmake/FindCompiler.cmake +++ b/cmake/FindCompiler.cmake @@ -37,7 +37,7 @@ if(is_dpcpp) # Check if the Nvidia target is supported. PortFFT uses this for choosing default configuration. 
check_cxx_compiler_flag("-fsycl -fsycl-targets=nvptx64-nvidia-cuda" dpcpp_supports_nvptx64) - if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND) + if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUSPARSE_BACKEND) list(APPEND UNIX_INTERFACE_COMPILE_OPTIONS -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda) list(APPEND UNIX_INTERFACE_LINK_OPTIONS @@ -51,7 +51,7 @@ if(is_dpcpp) -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${HIP_TARGETS}) endif() - if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_ROCBLAS_BACKEND + if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUSPARSE_BACKEND OR ENABLE_ROCBLAS_BACKEND OR ENABLE_ROCRAND_BACKEND OR ENABLE_ROCSOLVER_BACKEND) set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES INTERFACE_COMPILE_OPTIONS "${UNIX_INTERFACE_COMPILE_OPTIONS}" diff --git a/docs/building_the_project_with_dpcpp.rst b/docs/building_the_project_with_dpcpp.rst index 6076117f7..efe92f285 100644 --- a/docs/building_the_project_with_dpcpp.rst +++ b/docs/building_the_project_with_dpcpp.rst @@ -104,6 +104,9 @@ The most important supported build options are: * - ENABLE_CURAND_BACKEND - True, False - False + * - ENABLE_CUSPARSE_BACKEND + - True, False + - False * - ENABLE_NETLIB_BACKEND - True, False - False @@ -183,8 +186,8 @@ Building for CUDA ^^^^^^^^^^^^^^^^^ The CUDA backends can be enabled with ``ENABLE_CUBLAS_BACKEND``, -``ENABLE_CUFFT_BACKEND``, ``ENABLE_CURAND_BACKEND``, and -``ENABLE_CUSOLVER_BACKEND``. +``ENABLE_CUFFT_BACKEND``, ``ENABLE_CURAND_BACKEND``, +``ENABLE_CUSOLVER_BACKEND``, and ``ENABLE_CUSPARSE_BACKEND``. No additional parameters are required for using CUDA libraries. In most cases, the CUDA libraries should be found automatically by CMake. 
@@ -371,6 +374,7 @@ disabled using the Ninja build system: -DENABLE_CUBLAS_BACKEND=True \ -DENABLE_CUSOLVER_BACKEND=True \ -DENABLE_CURAND_BACKEND=True \ + -DENABLE_CUSPARSE_BACKEND=True \ -DBUILD_FUNCTIONAL_TESTS=False ``$ONEMKL_DIR`` points at the oneMKL source directly. The x86 CPU (``MKLCPU``) diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst index eab5afd56..07d90359a 100644 --- a/docs/domains/sparse_linear_algebra.rst +++ b/docs/domains/sparse_linear_algebra.rst @@ -20,21 +20,150 @@ Currently known limitations: - ``oneapi::mkl::sparse::set_csr_data`` and ``oneapi::mkl::sparse::set_coo_data`` functions cannot be used on a handle that has already been used for an operation or its optimize function. Doing so - will throw an ``oneapi::mkl::unimplemented`` exception. + will throw a ``oneapi::mkl::unimplemented`` exception. - Using ``spsv`` with the ``oneapi::mkl::sparse::spsv_alg::no_optimize_alg`` and a sparse matrix that does not have the - ``oneapi::mkl::sparse::matrix_property::sorted`` property will throw an + ``oneapi::mkl::sparse::matrix_property::sorted`` property will throw a ``oneapi::mkl::unimplemented`` exception. - Using ``spmm`` on Intel GPU with a sparse matrix that is ``oneapi::mkl::transpose::conjtrans`` and has the - ``oneapi::mkl::sparse::matrix_property::symmetric`` property will throw an + ``oneapi::mkl::sparse::matrix_property::symmetric`` property will throw a ``oneapi::mkl::unimplemented`` exception. - Using ``spmv`` with a sparse matrix that is ``oneapi::mkl::transpose::conjtrans`` with a ``type_view`` - ``matrix_descr::symmetric`` or ``matrix_descr::hermitian`` will throw an + ``matrix_descr::symmetric`` or ``matrix_descr::hermitian`` will throw a ``oneapi::mkl::unimplemented`` exception. 
- Using ``spsv`` on Intel GPU with a sparse matrix that is - ``oneapi::mkl::transpose::conjtrans`` and will throw an + ``oneapi::mkl::transpose::conjtrans`` will throw a ``oneapi::mkl::unimplemented`` exception. - Scalar parameters ``alpha`` and ``beta`` should be host pointers to prevent synchronizations and copies to the host. + + +cuSPARSE backend +---------------- + +Currently known limitations: + +- The COO format requires the indices to be sorted by row. See the `cuSPARSE + documentation + <https://docs.nvidia.com/cuda/cusparse/index.html#coordinate-coo>`_. Sparse + operations using matrices with the COO format without the property + ``matrix_property::sorted_by_rows`` or ``matrix_property::sorted`` will throw + a ``oneapi::mkl::unimplemented`` exception. +- Using ``spmm`` with the algorithm ``spmm_alg::csr_alg3`` and an ``opA`` other + than ``transpose::nontrans`` or an ``opB`` ``transpose::conjtrans`` will throw + a ``oneapi::mkl::unimplemented`` exception. +- Using ``spmm`` with the algorithm ``spmm_alg::csr_alg3``, + ``opB=transpose::trans`` and real fp64 precision will throw a + ``oneapi::mkl::unimplemented`` exception. This configuration can fail as of + CUDA 12.6.2, see the related issue + `here <https://forums.developer.nvidia.com/t/cusparse-spmm-sample-failing-with-misaligned-address/311022>`_. +- Using ``spmv`` with a ``type_view`` other than ``matrix_descr::general`` will + throw a ``oneapi::mkl::unimplemented`` exception. +- Using ``spsv`` with the algorithm ``spsv_alg::no_optimize_alg`` may still + perform some mandatory preprocessing. +- oneMKL Interface does not provide a way to use non-default algorithms without + calling preprocess functions such as ``cusparseSpMM_preprocess`` or + ``cusparseSpMV_preprocess``. Feel free to create an issue if this is needed. + + +Operation algorithms mapping +---------------------------- + +The following tables describe how a oneMKL SYCL Interface algorithm maps to the +backend's algorithms.
Refer to the backend's documentation for a more detailed +explanation of the algorithms. + +Backends with no equivalent algorithms will fall back to the backend's default +behavior. + + +spmm +^^^^ + +.. list-table:: + :header-rows: 1 + :widths: 10 30 45 + + * - ``spmm_alg`` value + - MKLCPU/MKLGPU + - cuSPARSE + * - ``default_alg`` + - none + - ``CUSPARSE_SPMM_ALG_DEFAULT`` + * - ``no_optimize_alg`` + - none + - ``CUSPARSE_SPMM_ALG_DEFAULT`` + * - ``coo_alg1`` + - none + - ``CUSPARSE_SPMM_COO_ALG1`` + * - ``coo_alg2`` + - none + - ``CUSPARSE_SPMM_COO_ALG2`` + * - ``coo_alg3`` + - none + - ``CUSPARSE_SPMM_COO_ALG3`` + * - ``coo_alg4`` + - none + - ``CUSPARSE_SPMM_COO_ALG4`` + * - ``csr_alg1`` + - none + - ``CUSPARSE_SPMM_CSR_ALG1`` + * - ``csr_alg2`` + - none + - ``CUSPARSE_SPMM_CSR_ALG2`` + * - ``csr_alg3`` + - none + - ``CUSPARSE_SPMM_CSR_ALG3`` + + +spmv +^^^^ + +.. list-table:: + :header-rows: 1 + :widths: 10 30 45 + + * - ``spmv_alg`` value + - MKLCPU/MKLGPU + - cuSPARSE + * - ``default_alg`` + - none + - ``CUSPARSE_SPMV_ALG_DEFAULT`` + * - ``no_optimize_alg`` + - none + - ``CUSPARSE_SPMV_ALG_DEFAULT`` + * - ``coo_alg1`` + - none + - ``CUSPARSE_SPMV_COO_ALG1`` + * - ``coo_alg2`` + - none + - ``CUSPARSE_SPMV_COO_ALG2`` + * - ``csr_alg1`` + - none + - ``CUSPARSE_SPMV_CSR_ALG1`` + * - ``csr_alg2`` + - none + - ``CUSPARSE_SPMV_CSR_ALG2`` + * - ``csr_alg3`` + - none + - ``CUSPARSE_SPMV_ALG_DEFAULT`` + + +spsv +^^^^ + +..
list-table:: + :header-rows: 1 + :widths: 10 30 45 + + * - ``spsv_alg`` value + - MKLCPU/MKLGPU + - cuSPARSE + * - ``default_alg`` + - none + - ``CUSPARSE_SPSV_ALG_DEFAULT`` + * - ``no_optimize_alg`` + - none + - ``CUSPARSE_SPSV_ALG_DEFAULT`` diff --git a/examples/README.md b/examples/README.md index 0dad8772d..45a100131 100644 --- a/examples/README.md +++ b/examples/README.md @@ -4,7 +4,7 @@ oneAPI Math Kernel Library (oneMKL) Interfaces offers examples with the followin - rng: uniform_usm - lapack: getrs_usm - dft: complex_fwd_usm, real_fwd_usm -- sparse_blas: sparse_gemv_usm +- sparse_blas: sparse_spmv_usm Each routine has one run-time dispatching example and one compile-time dispatching example (which uses both mklcpu and cuda backends), located in `example/<$domain>/run_time_dispatching` and `example/<$domain>/compile_time_dispatching` subfolders, respectively. @@ -487,111 +487,119 @@ Unsupported Configuration: Run-time dispatching examples with mklcpu backend ``` $ export ONEAPI_DEVICE_SELECTOR="opencl:cpu" -$ ./bin/example_sparse_blas_gemv_usm +$ ./bin/example_sparse_blas_spmv_usm ######################################################################## -# Sparse Matrix-Vector Multiply Example: -# +# Sparse Matrix-Vector Multiply Example: +# # y = alpha * op(A) * x + beta * y -# +# # where A is a sparse matrix in CSR format, x and y are dense vectors # and alpha, beta are floating point type precision scalars. -# +# # Using apis: -# sparse::gemv -# +# sparse::spmv +# # Using single precision (float) data type -# +# # Device will be selected during runtime. # The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify # available devices -# +# ######################################################################## -Running Sparse BLAS GEMV USM example on CPU device. -Device name is: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz +Running Sparse BLAS SPMV USM example on CPU device. 
+Device name is: Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz Running with single precision real data type: - sparse::gemv parameters: - transA = nontrans - nrows = 64 - alpha = 1, beta = 0 + sparse::spmv parameters: + transA = nontrans + nrows = 64 + alpha = 1, beta = 0 - sparse::gemv example passed - Finished -Sparse BLAS GEMV USM example ran OK. + sparse::spmv example passed + Finished +Sparse BLAS SPMV USM example ran OK. ``` Run-time dispatching examples with mklgpu backend ``` $ export ONEAPI_DEVICE_SELECTOR="level_zero:gpu" -$ ./bin/example_sparse_blas_gemv_usm +$ ./bin/example_sparse_blas_spmv_usm ######################################################################## -# Sparse Matrix-Vector Multiply Example: -# +# Sparse Matrix-Vector Multiply Example: +# # y = alpha * op(A) * x + beta * y -# +# # where A is a sparse matrix in CSR format, x and y are dense vectors # and alpha, beta are floating point type precision scalars. -# +# # Using apis: -# sparse::gemv -# +# sparse::spmv +# # Using single precision (float) data type -# +# # Device will be selected during runtime. # The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify # available devices -# +# ######################################################################## -Running Sparse BLAS GEMV USM example on GPU device. +Running Sparse BLAS SPMV USM example on GPU device. Device name is: Intel(R) HD Graphics 530 [0x1912] Running with single precision real data type: - sparse::gemv parameters: - transA = nontrans - nrows = 64 - alpha = 1, beta = 0 + sparse::spmv parameters: + transA = nontrans + nrows = 64 + alpha = 1, beta = 0 - sparse::gemv example passed - Finished -Sparse BLAS GEMV USM example ran OK. + sparse::spmv example passed + Finished +Sparse BLAS SPMV USM example ran OK. 
``` -Compile-time dispatching example with mklcpu backend +Compile-time dispatching example with both mklcpu and cusparse backends ``` -$ export ONEAPI_DEVICE_SELECTOR="opencl:cpu" -$ ./bin/example_sparse_blas_gemv_usm_mklcpu +$ ./bin/sparse_blas_spmv_usm_mklcpu_cusparse ######################################################################## -# Sparse Matrix-Vector Multiply Example: -# +# Sparse Matrix-Vector Multiply Example: +# # y = alpha * op(A) * x + beta * y -# -# where A is a sparse matrix in CSR format, x and y are dense vectors +# +# where A is a sparse matrix in COO format, x and y are dense vectors # and alpha, beta are floating point type precision scalars. -# +# # Using apis: -# sparse::gemv -# +# sparse::spmv +# # Using single precision (float) data type -# -# Running on Intel CPU device -# +# +# Running on both Intel CPU and Nvidia GPU devices +# ######################################################################## -Running Sparse BLAS GEMV USM example on CPU device. -Device name is: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz +Running Sparse BLAS SPMV USM example on: + CPU device: Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz + GPU device: NVIDIA A100-PCIE-40GB Running with single precision real data type: - sparse::gemv parameters: - transA = nontrans - nrows = 64 - alpha = 1, beta = 0 + sparse::spmv parameters: + transA = nontrans + size = 8 + alpha = 1, beta = 0 + + sparse::spmv example passed + Finished + + sparse::spmv parameters: + transA = nontrans + size = 8 + alpha = 1, beta = 0 - sparse::gemv example passed - Finished -Sparse BLAS GEMV USM example ran OK. + sparse::spmv example passed + Finished +Sparse BLAS SPMV USM example ran OK on MKLCPU and CUSPARSE.
``` diff --git a/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt b/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt index 5dbbba8a4..a38f4ebd4 100644 --- a/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt +++ b/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt @@ -18,27 +18,24 @@ #=============================================================================== #Build object from all sources -set(SPARSE_BLAS_BACKENDS "") - -if(ENABLE_MKLCPU_BACKEND) - list(APPEND SPARSE_BLAS_BACKENDS "mklcpu") +set(SPARSE_CT_SOURCES "") +if(ENABLE_MKLCPU_BACKEND AND ENABLE_CUSPARSE_BACKEND) + list(APPEND SPARSE_CT_SOURCES "sparse_blas_spmv_usm_mklcpu_cusparse") endif() include(WarningsUtils) -foreach(backend ${SPARSE_BLAS_BACKENDS}) - set(EXAMPLE_NAME example_sparse_blas_spmv_usm_${backend}) - add_executable(${EXAMPLE_NAME} sparse_blas_spmv_usm_${backend}.cpp) - target_include_directories(${EXAMPLE_NAME} +foreach(sparse_ct_source ${SPARSE_CT_SOURCES}) + add_executable(${sparse_ct_source} ${sparse_ct_source}.cpp) + target_include_directories(${sparse_ct_source} PUBLIC ${PROJECT_SOURCE_DIR}/examples/include PUBLIC ${PROJECT_SOURCE_DIR}/include PUBLIC ${CMAKE_BINARY_DIR}/bin ) - add_dependencies(${EXAMPLE_NAME} onemkl_sparse_blas_${backend}) - target_link_libraries(${EXAMPLE_NAME} PRIVATE ONEMKL::SYCL::SYCL onemkl_sparse_blas_${backend}) + target_link_libraries(${sparse_ct_source} PRIVATE ONEMKL::SYCL::SYCL onemkl_sparse_blas_mklcpu onemkl_sparse_blas_cusparse) # Register example as ctest - add_test(NAME sparse_blas/EXAMPLE/CT/sparse_blas_spmv_usm_${backend} COMMAND ${EXAMPLE_NAME}) -endforeach(backend) + add_test(NAME sparse_blas/EXAMPLE/CT/${sparse_ct_source} COMMAND ${sparse_ct_source}) +endforeach(sparse_ct_source) diff --git a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu.cpp b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp similarity index 55% rename from 
examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu.cpp rename to examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp index 964afb49b..31ce1975c 100644 --- a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu.cpp +++ b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp @@ -22,7 +22,7 @@ * Content: * This example demonstrates use of DPCPP API oneapi::mkl::sparse::spmv * using unified shared memory to perform general sparse matrix-vector -* multiplication on a INTEL CPU SYCL device. +* multiplication on an INTEL CPU SYCL device and an NVIDIA GPU SYCL device. * * y = alpha * op(A) * x + beta * y * @@ -59,69 +59,54 @@ // // is performed and finally the results are post processed. // -template <typename fp, typename intType> -int run_sparse_matrix_vector_multiply_example(const sycl::device& cpu_dev) { +template <typename fpType, typename intType, typename selectorType> +int run_sparse_matrix_vector_multiply_example(selectorType& selector) { + auto queue = selector.get_queue(); + // Matrix data size - intType size = 4; - intType nrows = size * size * size; + static constexpr intType size = 8; - // Set scalar fp values - fp alpha = set_fp_value(fp(1.0)); - fp beta = set_fp_value(fp(0.0)); + // Set scalar fpType values + fpType alpha = set_fp_value(fpType(1.0)); + fpType beta = set_fp_value(fpType(0.0)); - // Catch asynchronous exceptions - auto exception_handler = [](sycl::exception_list exceptions) { - for (std::exception_ptr const& e : exceptions) { - try { - std::rethrow_exception(e); - } - catch (sycl::exception const& e) { - std::cout << "Caught asynchronous SYCL " - "exception during sparse::spmv:\n" - << e.what() << std::endl; - } - } - }; + intType nnz = 9; + // host_ia must be sorted to maintain the sorted_by_rows property + intType host_ia[] = { 0, 0, 1, 3, 4, 4, 4, 7, 7 }; + intType host_ja[] = { 0, 7, 2, 2, 5, 4, 0, 0, 7 }; + + intType* ia =
(intType*)sycl::malloc_shared(nnz * sizeof(intType), queue); + intType* ja = (intType*)sycl::malloc_shared(nnz * sizeof(intType), queue); + fpType* a = (fpType*)sycl::malloc_shared(nnz * sizeof(fpType), queue); + fpType* x = (fpType*)sycl::malloc_shared(size * sizeof(fpType), queue); + fpType* y = (fpType*)sycl::malloc_shared(size * sizeof(fpType), queue); - // create execution queue and buffers of matrix data - sycl::queue cpu_queue(cpu_dev, exception_handler); - oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu> cpu_selector{ cpu_queue }; - - intType *ia, *ja; - fp *a, *x, *y, *z; - std::size_t sizea = static_cast<std::size_t>(27 * nrows); - std::size_t sizeja = static_cast<std::size_t>(27 * nrows); - std::size_t sizeia = static_cast<std::size_t>(nrows + 1); - std::size_t sizevec = static_cast<std::size_t>(nrows); - - ia = (intType*)sycl::malloc_shared(sizeia * sizeof(intType), cpu_queue); - ja = (intType*)sycl::malloc_shared(sizeja * sizeof(intType), cpu_queue); - a = (fp*)sycl::malloc_shared(sizea * sizeof(fp), cpu_queue); - x = (fp*)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue); - y = (fp*)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue); - z = (fp*)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue); - - if (!ia || !ja || !a || !x || !y || !z) { + if (!ia || !ja || !a || !x || !y) { throw std::runtime_error("Failed to allocate USM memory"); } - intType nnz = generate_sparse_matrix<fp, intType>(size, ia, ja, a); + // Copy ia and ja + queue.memcpy(ia, host_ia, nnz * sizeof(intType)).wait_and_throw(); + queue.memcpy(ja, host_ja, nnz * sizeof(intType)).wait_and_throw(); + + // Init matrix values + for (int i = 0; i < nnz; i++) { + a[i] = set_fp_value(fpType(i + 1)); + } // Init vectors x and y - for (int i = 0; i < nrows; i++) { - x[i] = set_fp_value(fp(1.0)); - y[i] = set_fp_value(fp(0.0)); - z[i] = set_fp_value(fp(0.0)); + for (int i = 0; i < size; i++) { + x[i] = set_fp_value(fpType(i + 1)); + y[i] = set_fp_value(fpType(0.0)); } 
std::vector<intType*> int_ptr_vec; int_ptr_vec.push_back(ia); int_ptr_vec.push_back(ja); - std::vector<fp*> fp_ptr_vec; + std::vector<fpType*> fp_ptr_vec; fp_ptr_vec.push_back(a); fp_ptr_vec.push_back(x); fp_ptr_vec.push_back(y); - fp_ptr_vec.push_back(z); // // Execute Matrix Multiply @@ -137,49 +122,52 @@ int run_sparse_matrix_vector_multiply_example(const sycl::device& cpu_dev) { ? "nontrans" : (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans")) << std::endl; - std::cout << "\t\t\tnrows = " << nrows << std::endl; + std::cout << "\t\t\tsize = " << size << std::endl; std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl; - // Create and initialize handle for a Sparse Matrix in CSR format + // Create and initialize handle for a Sparse Matrix in COO format sorted by rows oneapi::mkl::sparse::matrix_handle_t A_handle = nullptr; - oneapi::mkl::sparse::init_csr_matrix(cpu_selector, &A_handle, nrows, nrows, nnz, + oneapi::mkl::sparse::init_coo_matrix(selector, &A_handle, size, size, nnz, oneapi::mkl::index_base::zero, ia, ja, a); + // cuSPARSE backend requires that the property sorted_by_rows or sorted is set when using matrices in COO format. + // Setting these properties is also the best practice to get best performance. 
+ oneapi::mkl::sparse::set_matrix_property(selector, A_handle, + oneapi::mkl::sparse::matrix_property::sorted_by_rows); // Create and initialize dense vector handles oneapi::mkl::sparse::dense_vector_handle_t x_handle = nullptr; oneapi::mkl::sparse::dense_vector_handle_t y_handle = nullptr; - oneapi::mkl::sparse::init_dense_vector(cpu_selector, &x_handle, sizevec, x); - oneapi::mkl::sparse::init_dense_vector(cpu_selector, &y_handle, sizevec, y); + oneapi::mkl::sparse::init_dense_vector(selector, &x_handle, size, x); + oneapi::mkl::sparse::init_dense_vector(selector, &y_handle, size, y); // Create operation descriptor oneapi::mkl::sparse::spmv_descr_t descr = nullptr; - oneapi::mkl::sparse::init_spmv_descr(cpu_selector, &descr); + oneapi::mkl::sparse::init_spmv_descr(selector, &descr); // Allocate external workspace std::size_t workspace_size = 0; - oneapi::mkl::sparse::spmv_buffer_size(cpu_selector, transA, &alpha, A_view, A_handle, x_handle, + oneapi::mkl::sparse::spmv_buffer_size(selector, transA, &alpha, A_view, A_handle, x_handle, &beta, y_handle, alg, descr, workspace_size); - void* workspace = sycl::malloc_device(workspace_size, cpu_queue); + void* workspace = sycl::malloc_device(workspace_size, queue); // Optimize spmv auto ev_opt = - oneapi::mkl::sparse::spmv_optimize(cpu_selector, transA, &alpha, A_view, A_handle, x_handle, + oneapi::mkl::sparse::spmv_optimize(selector, transA, &alpha, A_view, A_handle, x_handle, &beta, y_handle, alg, descr, workspace); // Run spmv - auto ev_spmv = oneapi::mkl::sparse::spmv(cpu_selector, transA, &alpha, A_view, A_handle, - x_handle, &beta, y_handle, alg, descr, { ev_opt }); + auto ev_spmv = oneapi::mkl::sparse::spmv(selector, transA, &alpha, A_view, A_handle, x_handle, + &beta, y_handle, alg, descr, { ev_opt }); // Release handles and descriptor std::vector<sycl::event> release_events; release_events.push_back( - oneapi::mkl::sparse::release_dense_vector(cpu_selector, x_handle, { ev_spmv })); + 
oneapi::mkl::sparse::release_dense_vector(selector, x_handle, { ev_spmv })); release_events.push_back( - oneapi::mkl::sparse::release_dense_vector(cpu_selector, y_handle, { ev_spmv })); + oneapi::mkl::sparse::release_dense_vector(selector, y_handle, { ev_spmv })); release_events.push_back( - oneapi::mkl::sparse::release_sparse_matrix(cpu_selector, A_handle, { ev_spmv })); - release_events.push_back( - oneapi::mkl::sparse::release_spmv_descr(cpu_selector, descr, { ev_spmv })); + oneapi::mkl::sparse::release_sparse_matrix(selector, A_handle, { ev_spmv })); + release_events.push_back(oneapi::mkl::sparse::release_spmv_descr(selector, descr, { ev_spmv })); for (auto event : release_events) { event.wait_and_throw(); } @@ -188,33 +176,26 @@ int run_sparse_matrix_vector_multiply_example(const sycl::device& cpu_dev) { // Post Processing // - fp* res = y; - const bool isConj = (transA == oneapi::mkl::transpose::conjtrans); - for (intType row = 0; row < nrows; row++) { - z[row] *= beta; - } - for (intType row = 0; row < nrows; row++) { - fp tmp = alpha * x[row]; - for (intType i = ia[row]; i < ia[row + 1]; i++) { - if constexpr (is_complex<fp>()) { - z[ja[i]] += tmp * (isConj ? std::conj(a[i]) : a[i]); - } - else { - z[ja[i]] += tmp * a[i]; - } - } + // The example assumes matrices are not transposed and beta=0 for simplicity. + // See the tests for more in-depth verification. + fpType* res = y; + fpType expected_res[size] = {}; + for (intType i = 0; i < nnz; ++i) { + intType row = ia[i]; + intType col = ja[i]; + expected_res[row] += alpha * x[col] * a[i]; } bool good = true; - for (intType row = 0; row < nrows; row++) { - good &= check_result(res[row], z[row], nrows, row); + for (intType row = 0; row < size; row++) { + good &= check_result(res[row], expected_res[row], size, row); } std::cout << "\n\t\t sparse::spmv example " << (good ?
"passed" : "failed") << "\n\tFinished" << std::endl; - free_vec(fp_ptr_vec, cpu_queue); - free_vec(int_ptr_vec, cpu_queue); + free_vec(fp_ptr_vec, queue); + free_vec(int_ptr_vec, queue); if (!good) return 1; @@ -234,7 +215,7 @@ void print_example_banner() { std::cout << "# " << std::endl; std::cout << "# y = alpha * op(A) * x + beta * y" << std::endl; std::cout << "# " << std::endl; - std::cout << "# where A is a sparse matrix in CSR format, x and y are " + std::cout << "# where A is a sparse matrix in COO format, x and y are " "dense vectors" << std::endl; std::cout << "# and alpha, beta are floating point type precision scalars." << std::endl; @@ -244,7 +225,7 @@ void print_example_banner() { std::cout << "# " << std::endl; std::cout << "# Using single precision (float) data type" << std::endl; std::cout << "# " << std::endl; - std::cout << "# Running on Intel CPU device" << std::endl; + std::cout << "# Running on both Intel CPU and Nvidia GPU devices" << std::endl; std::cout << "# " << std::endl; std::cout << "########################################################################" << std::endl; @@ -257,17 +238,44 @@ void print_example_banner() { int main(int /*argc*/, char** /*argv*/) { print_example_banner(); + auto exception_handler = [](sycl::exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (sycl::exception const& e) { + std::cout << "Caught asynchronous SYCL " + "exception during sparse::spmv:\n" + << e.what() << std::endl; + } + } + }; + try { - // TODO: Add cuSPARSE compile-time dispatcher in this example once it is supported. 
- sycl::device cpu_dev(sycl::cpu_selector_v); + sycl::queue cpu_queue(sycl::cpu_selector_v, exception_handler); + sycl::queue gpu_queue(sycl::gpu_selector_v, exception_handler); + unsigned int vendor_id = gpu_queue.get_device().get_info<sycl::info::device::vendor_id>(); + if (vendor_id != NVIDIA_ID) { + std::cerr << "FAILED: NVIDIA GPU device not found" << std::endl; + return 1; + } + oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu> cpu_selector{ cpu_queue }; + oneapi::mkl::backend_selector<oneapi::mkl::backend::cusparse> gpu_selector{ gpu_queue }; - std::cout << "Running Sparse BLAS SPMV USM example on CPU device." << std::endl; - std::cout << "Device name is: " << cpu_dev.get_info<sycl::info::device::name>() + std::cout << "Running Sparse BLAS SPMV USM example on:" << std::endl; + std::cout << "\tCPU device: " << cpu_queue.get_device().get_info<sycl::info::device::name>() + << std::endl; + std::cout << "\tGPU device: " << gpu_queue.get_device().get_info<sycl::info::device::name>() << std::endl; std::cout << "Running with single precision real data type:" << std::endl; - run_sparse_matrix_vector_multiply_example<float, std::int32_t>(cpu_dev); - std::cout << "Sparse BLAS SPMV USM example ran OK." << std::endl; + int err = run_sparse_matrix_vector_multiply_example<float, std::int32_t>(cpu_selector); + if (err) + return err; + err = run_sparse_matrix_vector_multiply_example<float, std::int32_t>(gpu_selector); + if (err) + return err; + std::cout << "Sparse BLAS SPMV USM example ran OK on MKLCPU and CUSPARSE." 
<< std::endl; } catch (sycl::exception const& e) { std::cerr << "Caught synchronous SYCL exception during Sparse SPMV:" << std::endl; diff --git a/examples/sparse_blas/run_time_dispatching/CMakeLists.txt b/examples/sparse_blas/run_time_dispatching/CMakeLists.txt index 398f3e0f2..f09daf819 100644 --- a/examples/sparse_blas/run_time_dispatching/CMakeLists.txt +++ b/examples/sparse_blas/run_time_dispatching/CMakeLists.txt @@ -33,6 +33,9 @@ endif() if(ENABLE_MKLGPU_BACKEND) list(APPEND DEVICE_FILTERS "level_zero:gpu") endif() +if(ENABLE_CUSPARSE_BACKEND) + list(APPEND DEVICE_FILTERS "cuda:gpu") +endif() message(STATUS "ONEAPI_DEVICE_SELECTOR will be set to the following value(s): [${DEVICE_FILTERS}] for run-time dispatching examples") diff --git a/include/oneapi/mkl/detail/backends.hpp b/include/oneapi/mkl/detail/backends.hpp index 32b7c2614..216a6feba 100644 --- a/include/oneapi/mkl/detail/backends.hpp +++ b/include/oneapi/mkl/detail/backends.hpp @@ -40,20 +40,31 @@ enum class backend { cufft, rocfft, portfft, + cusparse, unsupported }; typedef std::map<backend, std::string> backendmap; -static backendmap backend_map = { - { backend::mklcpu, "mklcpu" }, { backend::mklgpu, "mklgpu" }, - { backend::cublas, "cublas" }, { backend::cusolver, "cusolver" }, - { backend::curand, "curand" }, { backend::netlib, "netlib" }, - { backend::rocblas, "rocblas" }, { backend::rocrand, "rocrand" }, - { backend::rocsolver, "rocsolver" }, { backend::portblas, "portblas" }, - { backend::cufft, "cufft" }, { backend::rocfft, "rocfft" }, - { backend::portfft, "portfft" }, { backend::unsupported, "unsupported" } -}; +// clang-format alternates the formatting depending on the parity of the number of backends +// It is disabled to reduce noise +// clang-format off +static backendmap backend_map = { { backend::mklcpu, "mklcpu" }, + { backend::mklgpu, "mklgpu" }, + { backend::cublas, "cublas" }, + { backend::cusolver, "cusolver" }, + { backend::curand, "curand" }, + { backend::netlib, "netlib" }, + 
{ backend::rocblas, "rocblas" }, + { backend::rocrand, "rocrand" }, + { backend::rocsolver, "rocsolver" }, + { backend::portblas, "portblas" }, + { backend::cufft, "cufft" }, + { backend::rocfft, "rocfft" }, + { backend::portfft, "portfft" }, + { backend::cusparse, "cusparse" }, + { backend::unsupported, "unsupported" } }; +// clang-format on } //namespace mkl } //namespace oneapi diff --git a/include/oneapi/mkl/detail/backends_table.hpp b/include/oneapi/mkl/detail/backends_table.hpp index 731781375..9b7c921d6 100644 --- a/include/oneapi/mkl/detail/backends_table.hpp +++ b/include/oneapi/mkl/detail/backends_table.hpp @@ -198,6 +198,12 @@ static std::map<domain, std::map<device, std::vector<const char*>>> libraries = { #ifdef ONEMKL_ENABLE_MKLGPU_BACKEND LIB_NAME("sparse_blas_mklgpu") +#endif + } }, + { device::nvidiagpu, + { +#ifdef ONEMKL_ENABLE_CUSPARSE_BACKEND + LIB_NAME("sparse_blas_cusparse") #endif } } } }, }; diff --git a/include/oneapi/mkl/sparse_blas.hpp b/include/oneapi/mkl/sparse_blas.hpp index 004b79727..8fb86f244 100644 --- a/include/oneapi/mkl/sparse_blas.hpp +++ b/include/oneapi/mkl/sparse_blas.hpp @@ -34,6 +34,9 @@ #ifdef ONEMKL_ENABLE_MKLGPU_BACKEND #include "sparse_blas/detail/mklgpu/sparse_blas_ct.hpp" #endif +#ifdef ONEMKL_ENABLE_CUSPARSE_BACKEND +#include "sparse_blas/detail/cusparse/sparse_blas_ct.hpp" +#endif #include "sparse_blas/detail/sparse_blas_rt.hpp" diff --git a/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp b/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp new file mode 100644 index 000000000..c8e816eeb --- /dev/null +++ b/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp @@ -0,0 +1,33 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +**************************************************************************/ + +#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_ONEMKL_SPARSE_BLAS_CUSPARSE_HPP_ +#define _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_ONEMKL_SPARSE_BLAS_CUSPARSE_HPP_ + +#include "oneapi/mkl/detail/export.hpp" +#include "oneapi/mkl/sparse_blas/detail/helper_types.hpp" +#include "oneapi/mkl/sparse_blas/types.hpp" + +namespace oneapi::mkl::sparse::cusparse { + +#include "oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx" + +} // namespace oneapi::mkl::sparse::cusparse + +#endif // _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_ONEMKL_SPARSE_BLAS_CUSPARSE_HPP_ diff --git a/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp b/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp new file mode 100644 index 000000000..11abb9a6f --- /dev/null +++ b/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp @@ -0,0 +1,40 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. 
+* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +**************************************************************************/ + +#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_SPARSE_BLAS_CT_HPP_ +#define _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_SPARSE_BLAS_CT_HPP_ + +#include "oneapi/mkl/detail/backends.hpp" +#include "oneapi/mkl/detail/backend_selector.hpp" + +#include "onemkl_sparse_blas_cusparse.hpp" + +namespace oneapi { +namespace mkl { +namespace sparse { + +#define BACKEND cusparse +#include "oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx" +#undef BACKEND + +} //namespace sparse +} //namespace mkl +} //namespace oneapi + +#endif // _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_SPARSE_BLAS_CT_HPP_ diff --git a/include/oneapi/mkl/sparse_blas/types.hpp b/include/oneapi/mkl/sparse_blas/types.hpp index d619be4b3..1a50d6ef4 100644 --- a/include/oneapi/mkl/sparse_blas/types.hpp +++ b/include/oneapi/mkl/sparse_blas/types.hpp @@ -36,6 +36,7 @@ namespace sparse { enum class matrix_property { symmetric, sorted, + sorted_by_rows, }; enum class spmm_alg { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6ff8d5d11..c363d8a8d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -59,6 +59,7 @@ function(generate_header_file) set(ONEMKL_ENABLE_CUFFT_BACKEND ${ENABLE_CUFFT_BACKEND}) set(ONEMKL_ENABLE_ROCFFT_BACKEND ${ENABLE_ROCFFT_BACKEND}) set(ONEMKL_ENABLE_PORTFFT_BACKEND ${ENABLE_PORTFFT_BACKEND}) + set(ONEMKL_ENABLE_CUSPARSE_BACKEND ${ENABLE_CUSPARSE_BACKEND}) configure_file(config.hpp.in "${CMAKE_CURRENT_BINARY_DIR}/oneapi/mkl/config.hpp.configured") file(GENERATE diff --git a/src/config.hpp.in b/src/config.hpp.in index de44cb16b..5d8b9a136 100644 --- a/src/config.hpp.in +++ b/src/config.hpp.in @@ 
-24,6 +24,7 @@ #cmakedefine ONEMKL_ENABLE_CUFFT_BACKEND #cmakedefine ONEMKL_ENABLE_CURAND_BACKEND #cmakedefine ONEMKL_ENABLE_CUSOLVER_BACKEND +#cmakedefine ONEMKL_ENABLE_CUSPARSE_BACKEND #cmakedefine ONEMKL_ENABLE_MKLCPU_BACKEND #cmakedefine ONEMKL_ENABLE_MKLGPU_BACKEND #cmakedefine ONEMKL_ENABLE_NETLIB_BACKEND diff --git a/src/sparse_blas/backends/CMakeLists.txt b/src/sparse_blas/backends/CMakeLists.txt index 294040808..baae9445d 100644 --- a/src/sparse_blas/backends/CMakeLists.txt +++ b/src/sparse_blas/backends/CMakeLists.txt @@ -27,3 +27,7 @@ endif() if(ENABLE_MKLGPU_BACKEND) add_subdirectory(mklgpu) endif() + +if(ENABLE_CUSPARSE_BACKEND) + add_subdirectory(cusparse) +endif() diff --git a/src/sparse_blas/backends/cusparse/CMakeLists.txt b/src/sparse_blas/backends/cusparse/CMakeLists.txt new file mode 100644 index 000000000..60bbaf35f --- /dev/null +++ b/src/sparse_blas/backends/cusparse/CMakeLists.txt @@ -0,0 +1,85 @@ +#=============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# +# SPDX-License-Identifier: Apache-2.0 +#=============================================================================== + +set(LIB_NAME onemkl_sparse_blas_cusparse) +set(LIB_OBJ ${LIB_NAME}_obj) + +include(WarningsUtils) + +add_library(${LIB_NAME}) +add_library(${LIB_OBJ} OBJECT + cusparse_handles.cpp + cusparse_scope_handle.cpp + operations/cusparse_spmm.cpp + operations/cusparse_spmv.cpp + operations/cusparse_spsv.cpp + $<$<BOOL:${BUILD_SHARED_LIBS}>: cusparse_wrappers.cpp> +) +add_dependencies(onemkl_backend_libs_sparse_blas ${LIB_NAME}) + +target_include_directories(${LIB_OBJ} + PRIVATE ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/src + ${CMAKE_BINARY_DIR}/bin + ${ONEMKL_GENERATED_INCLUDE_PATH} +) + +target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT}) + +if (${CMAKE_VERSION} VERSION_LESS "3.17.0") + find_package(CUDA 12.2 REQUIRED) + target_include_directories(${LIB_OBJ} PRIVATE ${CUDA_INCLUDE_DIRS}) + target_link_libraries(${LIB_OBJ} PUBLIC cuda rt ${CUDA_cusparse_LIBRARY}) +else() + find_package(CUDAToolkit 12.2 REQUIRED) + target_link_libraries(${LIB_OBJ} PRIVATE CUDA::cusparse CUDA::cudart CUDA::cuda_driver) +endif() + +target_link_libraries(${LIB_OBJ} + PUBLIC ONEMKL::SYCL::SYCL + PRIVATE onemkl_warnings +) + +set_target_properties(${LIB_OBJ} PROPERTIES + POSITION_INDEPENDENT_CODE ON +) +target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ}) + +#Set oneMKL libraries as not transitive for dynamic +if(BUILD_SHARED_LIBS) + set_target_properties(${LIB_NAME} PROPERTIES + INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL + ) +endif() + +# Add major version to the library +set_target_properties(${LIB_NAME} PROPERTIES + SOVERSION ${PROJECT_VERSION_MAJOR} +) + +# Add dependencies rpath to the library +list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>) + +# Add the library to install package +install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets) +install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets + RUNTIME DESTINATION bin + ARCHIVE 
DESTINATION lib + LIBRARY DESTINATION lib +) diff --git a/src/sparse_blas/backends/cusparse/cusparse_error.hpp b/src/sparse_blas/backends/cusparse/cusparse_error.hpp new file mode 100644 index 000000000..738888576 --- /dev/null +++ b/src/sparse_blas/backends/cusparse/cusparse_error.hpp @@ -0,0 +1,103 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +**************************************************************************/ + +#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_ERROR_HPP_ +#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_ERROR_HPP_ + +#include <string> + +#include <cuda.h> +#include <cusparse.h> + +#include "oneapi/mkl/exceptions.hpp" + +namespace oneapi::mkl::sparse::cusparse::detail { + +inline std::string cuda_result_to_str(CUresult result) { + switch (result) { +#define ONEMKL_CUSPARSE_CASE(STATUS) \ + case STATUS: return #STATUS + ONEMKL_CUSPARSE_CASE(CUDA_SUCCESS); + ONEMKL_CUSPARSE_CASE(CUDA_ERROR_NOT_PERMITTED); + ONEMKL_CUSPARSE_CASE(CUDA_ERROR_INVALID_CONTEXT); + ONEMKL_CUSPARSE_CASE(CUDA_ERROR_INVALID_DEVICE); + ONEMKL_CUSPARSE_CASE(CUDA_ERROR_INVALID_VALUE); + ONEMKL_CUSPARSE_CASE(CUDA_ERROR_OUT_OF_MEMORY); + ONEMKL_CUSPARSE_CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES); + default: return "<unknown>"; + } +} + +#define CUDA_ERROR_FUNC(func, ...) \ + do { \ + auto res = func(__VA_ARGS__); \ + if (res != CUDA_SUCCESS) { \ + throw oneapi::mkl::exception("sparse_blas", #func, \ + "cuda error: " + detail::cuda_result_to_str(res)); \ + } \ + } while (0) + +inline std::string cusparse_status_to_str(cusparseStatus_t status) { + switch (status) { +#define ONEMKL_CUSPARSE_CASE(STATUS) \ + case STATUS: return #STATUS + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_SUCCESS); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_NOT_INITIALIZED); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_ALLOC_FAILED); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_INVALID_VALUE); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_ARCH_MISMATCH); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_EXECUTION_FAILED); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_INTERNAL_ERROR); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_NOT_SUPPORTED); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); +#undef ONEMKL_CUSPARSE_CASE + default: return "<unknown>"; + } +} + +inline void check_status(cusparseStatus_t status, const 
std::string& function, + std::string error_str = "") { + if (status != CUSPARSE_STATUS_SUCCESS) { + if (!error_str.empty()) { + error_str += "; "; + } + error_str += "cuSPARSE status: " + cusparse_status_to_str(status); + switch (status) { + case CUSPARSE_STATUS_NOT_SUPPORTED: + throw oneapi::mkl::unimplemented("sparse_blas", function, error_str); + case CUSPARSE_STATUS_NOT_INITIALIZED: + throw oneapi::mkl::uninitialized("sparse_blas", function, error_str); + case CUSPARSE_STATUS_INVALID_VALUE: + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + throw oneapi::mkl::invalid_argument("sparse_blas", function, error_str); + default: throw oneapi::mkl::exception("sparse_blas", function, error_str); + } + } +} + +#define CUSPARSE_ERR_FUNC(func, ...) \ + do { \ + auto status = func(__VA_ARGS__); \ + detail::check_status(status, #func); \ + } while (0) + +} // namespace oneapi::mkl::sparse::cusparse::detail + +#endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_ERROR_HPP_ diff --git a/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp new file mode 100644 index 000000000..179b007f5 --- /dev/null +++ b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp @@ -0,0 +1,63 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +* +**************************************************************************/ + +#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_GLOBAL_HANDLE_HPP_ +#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_GLOBAL_HANDLE_HPP_ + +/** + * @file Similar to blas_handle.hpp + * Provides a map from a ur_context_handle_t (or equivalent) to a cusparseHandle_t. + * @see cusparse_scope_handle.hpp +*/ + +#include <atomic> +#include <unordered_map> + +namespace oneapi::mkl::sparse::cusparse::detail { + +template <typename T> +struct cusparse_global_handle { + using handle_container_t = std::unordered_map<T, std::atomic<cusparseHandle_t>*>; + handle_container_t cusparse_global_handle_mapper_{}; + + ~cusparse_global_handle() noexcept(false) { + for (auto& handle_pair : cusparse_global_handle_mapper_) { + if (handle_pair.second != nullptr) { + auto handle = handle_pair.second->exchange(nullptr); + if (handle != nullptr) { + CUSPARSE_ERR_FUNC(cusparseDestroy, handle); + handle = nullptr; + } + else { + // if the handle is nullptr it means the handle was already + // destroyed by the ContextCallback and we're free to delete the + // atomic object. 
+ delete handle_pair.second; + } + + handle_pair.second = nullptr; + } + } + cusparse_global_handle_mapper_.clear(); + } +}; + +} // namespace oneapi::mkl::sparse::cusparse::detail + +#endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_GLOBAL_HANDLE_HPP_ diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp new file mode 100644 index 000000000..ff3d8fcae --- /dev/null +++ b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp @@ -0,0 +1,485 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +**************************************************************************/ + +#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp" + +#include "cusparse_error.hpp" +#include "cusparse_helper.hpp" +#include "cusparse_handles.hpp" +#include "cusparse_task.hpp" +#include "sparse_blas/macros.hpp" + +namespace oneapi::mkl::sparse::cusparse { + +/** + * In this file, CusparseScopedContextHandler is used to ensure that a cusparseHandle_t is created before any other cuSPARSE call, as required by the specification. 
+*/ + +// Dense vector +template <typename fpType> +void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, std::int64_t size, + sycl::buffer<fpType, 1> val) { + auto event = queue.submit([&](sycl::handler& cgh) { + auto acc = val.template get_access<sycl::access::mode::read_write>(cgh); + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. + detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + cusparseDnVecDescr_t cu_dvhandle; + CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, detail::get_mem(ih, acc), + cuda_value_type); + *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size); + }); + }); + event.wait_and_throw(); +} + +template <typename fpType> +void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, std::int64_t size, + fpType* val) { + auto event = queue.submit([&](sycl::handler& cgh) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + cusparseDnVecDescr_t cu_dvhandle; + CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, val, cuda_value_type); + *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size); + }); + }); + event.wait_and_throw(); +} + +template <typename fpType> +void set_dense_vector_data(sycl::queue& queue, dense_vector_handle_t dvhandle, std::int64_t size, + sycl::buffer<fpType, 1> val) { + detail::check_can_reset_value_handle<fpType>(__func__, dvhandle, true); + auto event = queue.submit([&](sycl::handler& cgh) { + auto acc = val.template get_access<sycl::access::mode::read_write>(cgh); + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + if (dvhandle->size != size) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size, + detail::get_mem(ih, acc), cuda_value_type); + dvhandle->size = size; + } + else { + CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle, + detail::get_mem(ih, acc)); + } + dvhandle->set_buffer(val); + }); + }); + event.wait_and_throw(); +} + +template <typename fpType> +void set_dense_vector_data(sycl::queue&, dense_vector_handle_t dvhandle, std::int64_t size, + fpType* val) { + detail::check_can_reset_value_handle<fpType>(__func__, dvhandle, false); + if (dvhandle->size != size) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size, val, + cuda_value_type); + dvhandle->size = size; + } + else { + CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle, val); + } + dvhandle->set_usm_ptr(val); +} + +FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_VECTOR_FUNCS); + +sycl::event 
release_dense_vector(sycl::queue& queue, dense_vector_handle_t dvhandle, + const std::vector<sycl::event>& dependencies) { + // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used + auto functor = [=](sycl::interop_handle) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); + delete dvhandle; + }; + return detail::dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dvhandle); +} + +// Dense matrix +template <typename fpType> +void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t ld, layout dense_layout, + sycl::buffer<fpType, 1> val) { + auto event = queue.submit([&](sycl::handler& cgh) { + auto acc = val.template get_access<sycl::access::mode::read_write>(cgh); + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. + detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + auto cuda_order = detail::get_cuda_order(dense_layout); + cusparseDnMatDescr_t cu_dmhandle; + CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld, + detail::get_mem(ih, acc), cuda_value_type, cuda_order); + *p_dmhandle = + new dense_matrix_handle(cu_dmhandle, val, num_rows, num_cols, ld, dense_layout); + }); + }); + event.wait_and_throw(); +} + +template <typename fpType> +void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t ld, layout dense_layout, fpType* val) { + auto event = queue.submit([&](sycl::handler& cgh) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + auto cuda_order = detail::get_cuda_order(dense_layout); + cusparseDnMatDescr_t cu_dmhandle; + CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld, val, + cuda_value_type, cuda_order); + *p_dmhandle = + new dense_matrix_handle(cu_dmhandle, val, num_rows, num_cols, ld, dense_layout); + }); + }); + event.wait_and_throw(); +} + +template <typename fpType> +void set_dense_matrix_data(sycl::queue& queue, dense_matrix_handle_t dmhandle, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, + oneapi::mkl::layout dense_layout, sycl::buffer<fpType, 1> val) { + detail::check_can_reset_value_handle<fpType>(__func__, dmhandle, true); + auto event = queue.submit([&](sycl::handler& cgh) { + auto acc = val.template get_access<sycl::access::mode::read_write>(cgh); + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || + dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + auto cuda_order = detail::get_cuda_order(dense_layout); + CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows, + num_cols, ld, detail::get_mem(ih, acc), cuda_value_type, + cuda_order); + dmhandle->num_rows = num_rows; + dmhandle->num_cols = num_cols; + dmhandle->ld = ld; + dmhandle->dense_layout = dense_layout; + } + else { + CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle, + detail::get_mem(ih, acc)); + } + dmhandle->set_buffer(val); + }); + }); + event.wait_and_throw(); +} + +template <typename fpType> +void set_dense_matrix_data(sycl::queue&, dense_matrix_handle_t dmhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, + 
fpType* val) { + detail::check_can_reset_value_handle<fpType>(__func__, dmhandle, false); + if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || dmhandle->ld != ld || + dmhandle->dense_layout != dense_layout) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + auto cuda_order = detail::get_cuda_order(dense_layout); + CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows, num_cols, ld, + val, cuda_value_type, cuda_order); + dmhandle->num_rows = num_rows; + dmhandle->num_cols = num_cols; + dmhandle->ld = ld; + dmhandle->dense_layout = dense_layout; + } + else { + CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle, val); + } + dmhandle->set_usm_ptr(val); +} + +FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_MATRIX_FUNCS); + +sycl::event release_dense_matrix(sycl::queue& queue, dense_matrix_handle_t dmhandle, + const std::vector<sycl::event>& dependencies) { + // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used + auto functor = [=](sycl::interop_handle) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); + delete dmhandle; + }; + return detail::dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dmhandle); +} + +// COO matrix +template <typename fpType, typename intType> +void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + sycl::buffer<intType, 1> row_ind, sycl::buffer<intType, 1> col_ind, + sycl::buffer<fpType, 1> val) { + auto event = queue.submit([&](sycl::handler& cgh) { + auto row_acc = row_ind.template get_access<sycl::access::mode::read_write>(cgh); + auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh); + auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh); + detail::submit_host_task(cgh, 
queue, [=](sycl::interop_handle ih) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. + detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_index_type = detail::CudaIndexEnumType<intType>::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + cusparseSpMatDescr_t cu_smhandle; + CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, + detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc), + detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_base, + cuda_value_type); + *p_smhandle = + new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO, + num_rows, num_cols, nnz, index); + }); + }); + event.wait_and_throw(); +} + +template <typename fpType, typename intType> +void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + intType* row_ind, intType* col_ind, fpType* val) { + auto event = queue.submit([&](sycl::handler& cgh) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_index_type = detail::CudaIndexEnumType<intType>::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + cusparseSpMatDescr_t cu_smhandle; + CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, row_ind, + col_ind, val, cuda_index_type, cuda_index_base, cuda_value_type); + *p_smhandle = + new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO, + num_rows, num_cols, nnz, index); + }); + }); + event.wait_and_throw(); +} + +template <typename fpType, typename intType> +void set_coo_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + sycl::buffer<intType, 1> row_ind, sycl::buffer<intType, 1> col_ind, + sycl::buffer<fpType, 1> val) { + detail::check_can_reset_sparse_handle<fpType, intType>(__func__, smhandle, true); + auto event = queue.submit([&](sycl::handler& cgh) { + auto row_acc = row_ind.template get_access<sycl::access::mode::read_write>(cgh); + auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh); + auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh); + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || + smhandle->nnz != nnz || smhandle->index != index) { + CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); + auto cuda_index_type = detail::CudaIndexEnumType<intType>::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols, + nnz, detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc), + detail::get_mem(ih, val_acc), cuda_index_type, 
cuda_index_base, + cuda_value_type); + smhandle->num_rows = num_rows; + smhandle->num_cols = num_cols; + smhandle->nnz = nnz; + smhandle->index = index; + } + else { + CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle, + detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc), + detail::get_mem(ih, val_acc)); + } + smhandle->row_container.set_buffer(row_ind); + smhandle->col_container.set_buffer(col_ind); + smhandle->value_container.set_buffer(val); + }); + }); + event.wait_and_throw(); +} + +template <typename fpType, typename intType> +void set_coo_matrix_data(sycl::queue&, matrix_handle_t smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + intType* row_ind, intType* col_ind, fpType* val) { + detail::check_can_reset_sparse_handle<fpType, intType>(__func__, smhandle, false); + if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || + smhandle->index != index) { + CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); + auto cuda_index_type = detail::CudaIndexEnumType<intType>::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols, nnz, + row_ind, col_ind, val, cuda_index_type, cuda_index_base, cuda_value_type); + smhandle->num_rows = num_rows; + smhandle->num_cols = num_cols; + smhandle->nnz = nnz; + smhandle->index = index; + } + else { + CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle, row_ind, col_ind, val); + } + smhandle->row_container.set_usm_ptr(row_ind); + smhandle->col_container.set_usm_ptr(col_ind); + smhandle->value_container.set_usm_ptr(val); +} + +FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_COO_MATRIX_FUNCS); + +// CSR matrix +template <typename fpType, typename intType> +void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t 
num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + sycl::buffer<intType, 1> row_ptr, sycl::buffer<intType, 1> col_ind, + sycl::buffer<fpType, 1> val) { + auto event = queue.submit([&](sycl::handler& cgh) { + auto row_acc = row_ptr.template get_access<sycl::access::mode::read_write>(cgh); + auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh); + auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh); + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. + detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_index_type = detail::CudaIndexEnumType<intType>::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + cusparseSpMatDescr_t cu_smhandle; + CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, + detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc), + detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_type, + cuda_index_base, cuda_value_type); + *p_smhandle = + new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR, + num_rows, num_cols, nnz, index); + }); + }); + event.wait_and_throw(); +} + +template <typename fpType, typename intType> +void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + intType* row_ptr, intType* col_ind, fpType* val) { + auto event = queue.submit([&](sycl::handler& cgh) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_index_type = detail::CudaIndexEnumType<intType>::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + cusparseSpMatDescr_t cu_smhandle; + CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, row_ptr, + col_ind, val, cuda_index_type, cuda_index_type, cuda_index_base, + cuda_value_type); + *p_smhandle = + new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR, + num_rows, num_cols, nnz, index); + }); + }); + event.wait_and_throw(); +} + +template <typename fpType, typename intType> +void set_csr_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + sycl::buffer<intType, 1> row_ptr, sycl::buffer<intType, 1> col_ind, + sycl::buffer<fpType, 1> val) { + detail::check_can_reset_sparse_handle<fpType, intType>(__func__, smhandle, true); + auto event = queue.submit([&](sycl::handler& cgh) { + auto row_acc = row_ptr.template get_access<sycl::access::mode::read_write>(cgh); + auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh); + auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh); + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || + smhandle->nnz != nnz || smhandle->index != index) { + CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); + auto cuda_index_type = detail::CudaIndexEnumType<intType>::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols, + nnz, detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc), + detail::get_mem(ih, val_acc), 
cuda_index_type, cuda_index_type, + cuda_index_base, cuda_value_type); + smhandle->num_rows = num_rows; + smhandle->num_cols = num_cols; + smhandle->nnz = nnz; + smhandle->index = index; + } + else { + CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle, + detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc), + detail::get_mem(ih, val_acc)); + } + smhandle->row_container.set_buffer(row_ptr); + smhandle->col_container.set_buffer(col_ind); + smhandle->value_container.set_buffer(val); + }); + }); + event.wait_and_throw(); +} + +template <typename fpType, typename intType> +void set_csr_matrix_data(sycl::queue&, matrix_handle_t smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + intType* row_ptr, intType* col_ind, fpType* val) { + detail::check_can_reset_sparse_handle<fpType, intType>(__func__, smhandle, false); + if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || + smhandle->index != index) { + CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); + auto cuda_index_type = detail::CudaIndexEnumType<intType>::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType<fpType>::value; + CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols, nnz, + row_ptr, col_ind, val, cuda_index_type, cuda_index_type, cuda_index_base, + cuda_value_type); + smhandle->num_rows = num_rows; + smhandle->num_cols = num_cols; + smhandle->nnz = nnz; + smhandle->index = index; + } + else { + CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle, row_ptr, col_ind, val); + } + smhandle->row_container.set_usm_ptr(row_ptr); + smhandle->col_container.set_usm_ptr(col_ind); + smhandle->value_container.set_usm_ptr(val); +} + +FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_CSR_MATRIX_FUNCS); + +sycl::event release_sparse_matrix(sycl::queue& queue, matrix_handle_t smhandle, + const 
std::vector<sycl::event>& dependencies) { + // Use dispatch_submit to ensure the backend's handle is kept alive as long as the buffers are used + auto functor = [=](sycl::interop_handle) { + CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); + delete smhandle; + }; + return detail::dispatch_submit(__func__, queue, dependencies, functor, smhandle); +} + +// Matrix property +bool set_matrix_property(sycl::queue&, matrix_handle_t smhandle, matrix_property property) { + // No equivalent in cuSPARSE + // Store the matrix property internally for future usages + smhandle->set_matrix_property(property); + return false; +} + +} // namespace oneapi::mkl::sparse::cusparse diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.hpp b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp new file mode 100644 index 000000000..5e5bdc732 --- /dev/null +++ b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp @@ -0,0 +1,95 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_
+#define _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_
+
+#include <cusparse.h>
+
+#include "sparse_blas/generic_container.hpp"
+
+namespace oneapi::mkl::sparse {
+
+// dense_vector_handle, dense_matrix_handle and matrix_handle are incomplete types until a
+// backend defines them. The cuSPARSE backend completes them here by deriving from the generic
+// handles, instantiated with the matching cuSPARSE descriptor type.
+
+/// Dense vector handle backed by a cusparseDnVecDescr_t.
+struct dense_vector_handle : public detail::generic_dense_vector_handle<cusparseDnVecDescr_t> {
+    /// USM overload: forwards the descriptor, the value pointer and the size to the base.
+    template <typename T>
+    dense_vector_handle(cusparseDnVecDescr_t cu_descr, T* value_ptr, std::int64_t size)
+            : generic_dense_vector_handle(cu_descr, value_ptr, size) {}
+
+    /// Buffer overload: forwards the descriptor, the value buffer and the size to the base.
+    template <typename T>
+    dense_vector_handle(cusparseDnVecDescr_t cu_descr, const sycl::buffer<T, 1> value_buffer,
+                        std::int64_t size)
+            : generic_dense_vector_handle(cu_descr, value_buffer, size) {}
+};
+
+/// Dense matrix handle backed by a cusparseDnMatDescr_t.
+struct dense_matrix_handle : public detail::generic_dense_matrix_handle<cusparseDnMatDescr_t> {
+    /// USM overload: forwards the descriptor, the value pointer and the matrix shape to the base.
+    template <typename T>
+    dense_matrix_handle(cusparseDnMatDescr_t cu_descr, T* value_ptr, std::int64_t num_rows,
+                        std::int64_t num_cols, std::int64_t ld, layout dense_layout)
+            : generic_dense_matrix_handle(cu_descr, value_ptr, num_rows, num_cols, ld,
+                                          dense_layout) {}
+
+    /// Buffer overload: forwards the descriptor, the value buffer and the matrix shape to the base.
+    template <typename T>
+    dense_matrix_handle(cusparseDnMatDescr_t cu_descr, const sycl::buffer<T, 1> value_buffer,
+                        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
+                        layout dense_layout)
+            : generic_dense_matrix_handle(cu_descr, value_buffer, num_rows, num_cols, ld,
+                                          dense_layout) {}
+};
+
+/// Sparse matrix handle backed by a cusparseSpMatDescr_t.
+struct matrix_handle : public detail::generic_sparse_handle<cusparseSpMatDescr_t> {
+    /// USM overload: forwards the descriptor, the three data pointers, the format and the
+    /// matrix dimensions to the base.
+    template <typename fpType, typename intType>
+    matrix_handle(cusparseSpMatDescr_t cu_descr, intType* row_ptr, intType* col_ptr,
+                  fpType* value_ptr, detail::sparse_format format, std::int64_t num_rows,
+                  std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index)
+            : generic_sparse_handle(cu_descr, row_ptr, col_ptr, value_ptr, format, num_rows,
+                                    num_cols, nnz, index) {}
+
+    /// Buffer overload: forwards the descriptor, the three data buffers, the format and the
+    /// matrix dimensions to the base.
+    template <typename fpType, typename intType>
+    matrix_handle(cusparseSpMatDescr_t cu_descr, const sycl::buffer<intType, 1> row_buffer,
+                  const sycl::buffer<intType, 1> col_buffer,
+                  const sycl::buffer<fpType, 1> value_buffer, detail::sparse_format format,
+                  std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
+                  oneapi::mkl::index_base index)
+            : generic_sparse_handle(cu_descr, row_buffer, col_buffer, value_buffer, format,
+                                    num_rows, num_cols, nnz, index) {}
+};
+
+namespace detail {
+
+/// Throw mkl::unimplemented if sm_handle is a COO matrix that has neither the
+/// `sorted_by_rows` nor the `sorted` property set. Other formats are accepted as-is.
+/// @param function_name Name reported in the exception.
+/// @param sm_handle Sparse matrix handle to validate.
+inline void check_valid_matrix_properties(const std::string& function_name,
+                                          matrix_handle_t sm_handle) {
+    // Only the COO format carries a sortedness requirement.
+    if (sm_handle->format != sparse_format::COO) {
+        return;
+    }
+    const bool is_sorted = sm_handle->has_matrix_property(matrix_property::sorted_by_rows) ||
+                           sm_handle->has_matrix_property(matrix_property::sorted);
+    if (is_sorted) {
+        return;
+    }
+    throw mkl::unimplemented(
+        "sparse_blas", function_name,
+        "The backend does not support unsorted COO format. Use `set_matrix_property` to set the property `matrix_property::sorted_by_rows` or `matrix_property::sorted`");
+}
+
+} // namespace detail
+
+} // namespace oneapi::mkl::sparse
+
+#endif // _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_helper.hpp b/src/sparse_blas/backends/cusparse/cusparse_helper.hpp
new file mode 100644
index 000000000..3feb4bcad
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_helper.hpp
@@ -0,0 +1,166 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_HELPER_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_HELPER_HPP_
+
+#include <complex>
+#include <cstdint>
+#include <limits>
+#include <string>
+
+#include <cusparse.h>
+
+#include "oneapi/mkl/sparse_blas/types.hpp"
+#include "sparse_blas/enum_data_types.hpp"
+#include "sparse_blas/sycl_helper.hpp"
+#include "cusparse_error.hpp"
+
+namespace oneapi::mkl::sparse::cusparse::detail {
+
+using namespace oneapi::mkl::sparse::detail;
+
+/// Compile-time mapping from a floating point value type to its cudaDataType_t enumerator.
+/// Only the specialisations below are defined; any other type fails to compile.
+template <typename T>
+struct CudaEnumType;
+template <>
+struct CudaEnumType<float> {
+    static constexpr cudaDataType_t value = CUDA_R_32F;
+};
+template <>
+struct CudaEnumType<double> {
+    static constexpr cudaDataType_t value = CUDA_R_64F;
+};
+template <>
+struct CudaEnumType<std::complex<float>> {
+    static constexpr cudaDataType_t value = CUDA_C_32F;
+};
+template <>
+struct CudaEnumType<std::complex<double>> {
+    static constexpr cudaDataType_t value = CUDA_C_64F;
+};
+
+/// Compile-time mapping from an index type to its cusparseIndexType_t enumerator.
+/// Only the 32-bit and 64-bit signed specialisations below are defined.
+template <typename T>
+struct CudaIndexEnumType;
+template <>
+struct CudaIndexEnumType<std::int32_t> {
+    static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+};
+template <>
+struct CudaIndexEnumType<std::int64_t> {
+    static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I;
+};
+
+/// Format an enumerator as decimal text for use in error messages.
+/// The value is narrowed to char first, then printed as a number by std::to_string.
+template <typename E>
+inline std::string cast_enum_to_str(E e) {
+    return std::to_string(static_cast<char>(e));
+}
+
+/// Translate a runtime data_type into the matching cudaDataType_t.
+/// @throws oneapi::mkl::invalid_argument for any data_type other than the four
+///         floating point types handled below.
+inline cudaDataType_t get_cuda_value_type(data_type onemkl_data_type) {
+    switch (onemkl_data_type) {
+        case data_type::real_fp32: return CUDA_R_32F;
+        case data_type::real_fp64: return CUDA_R_64F;
+        case data_type::complex_fp32: return CUDA_C_32F;
+        case data_type::complex_fp64: return CUDA_C_64F;
+        default:
+            throw oneapi::mkl::invalid_argument(
+                "sparse_blas", "get_cuda_value_type",
+                "Invalid data type: " + cast_enum_to_str(onemkl_data_type));
+    }
+}
+
+/// Translate a dense layout into the matching cusparseOrder_t.
+/// @throws oneapi::mkl::invalid_argument for an unknown layout.
+inline cusparseOrder_t get_cuda_order(layout l) {
+    switch (l) {
+        case layout::row_major: return CUSPARSE_ORDER_ROW;
+        case layout::col_major: return CUSPARSE_ORDER_COL;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_order",
+                                                "Unknown layout: " + cast_enum_to_str(l));
+    }
+}
+
+/// Translate an index_base into the matching cusparseIndexBase_t.
+/// @throws oneapi::mkl::invalid_argument for an unknown index_base.
+inline cusparseIndexBase_t get_cuda_index_base(index_base index) {
+    switch (index) {
+        case index_base::zero: return CUSPARSE_INDEX_BASE_ZERO;
+        case index_base::one: return CUSPARSE_INDEX_BASE_ONE;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_index_base",
+                                                "Unknown index_base: " + cast_enum_to_str(index));
+    }
+}
+
+/// Return the CUDA transpose operation from a oneMKL type.
+/// Do not conjugate for real types to avoid an invalid argument.
+/// @param type Value type of the operation; only consulted for transpose::conjtrans.
+/// @param op oneMKL transpose operation to translate.
+/// @throws oneapi::mkl::invalid_argument for an unknown transpose operation.
+inline cusparseOperation_t get_cuda_operation(data_type type, transpose op) {
+    switch (op) {
+        case transpose::nontrans: return CUSPARSE_OPERATION_NON_TRANSPOSE;
+        case transpose::trans: return CUSPARSE_OPERATION_TRANSPOSE;
+        case transpose::conjtrans:
+            return (type == data_type::complex_fp32 || type == data_type::complex_fp64)
+                       ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
+                       : CUSPARSE_OPERATION_TRANSPOSE;
+        default:
+            throw oneapi::mkl::invalid_argument(
+                "sparse_blas", "get_cuda_operation",
+                "Unknown transpose operation: " + cast_enum_to_str(op));
+    }
+}
+
+/// Translate a oneMKL uplo into the matching cuSPARSE fill mode.
+/// @throws oneapi::mkl::invalid_argument for an unknown uplo.
+inline auto get_cuda_uplo(uplo uplo_val) {
+    switch (uplo_val) {
+        case uplo::upper: return CUSPARSE_FILL_MODE_UPPER;
+        case uplo::lower: return CUSPARSE_FILL_MODE_LOWER;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_uplo",
+                                                "Unknown uplo: " + cast_enum_to_str(uplo_val));
+    }
+}
+
+/// Translate a oneMKL diag into the matching cuSPARSE diagonal type.
+/// @throws oneapi::mkl::invalid_argument for an unknown diag.
+inline auto get_cuda_diag(diag diag_val) {
+    switch (diag_val) {
+        case diag::nonunit: return CUSPARSE_DIAG_TYPE_NON_UNIT;
+        case diag::unit: return CUSPARSE_DIAG_TYPE_UNIT;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_diag",
+                                                "Unknown diag: " + cast_enum_to_str(diag_val));
+    }
+}
+
+/// Set the fill mode and the diagonal type attributes of a cuSPARSE sparse matrix descriptor
+/// from the uplo_view and diag_view of a oneMKL matrix_view.
+/// @param func_name Name of the calling function, used as prefix when reporting an error.
+/// @param cu_a cuSPARSE sparse matrix descriptor to update.
+/// @param A_view Matrix view providing the uplo and diag values.
+inline void set_matrix_attributes(const std::string& func_name, cusparseSpMatDescr_t cu_a,
+                                  oneapi::mkl::sparse::matrix_view A_view) {
+    auto cu_fill_mode = get_cuda_uplo(A_view.uplo_view);
+    auto status = cusparseSpMatSetAttribute(cu_a, CUSPARSE_SPMAT_FILL_MODE, &cu_fill_mode,
+                                            sizeof(cu_fill_mode));
+    check_status(status, func_name + "/set_uplo");
+
+    auto cu_diag_type = get_cuda_diag(A_view.diag_view);
+    status = cusparseSpMatSetAttribute(cu_a, CUSPARSE_SPMAT_DIAG_TYPE, &cu_diag_type,
+                                       sizeof(cu_diag_type));
+    check_status(status, func_name + "/set_diag");
+}
+
+/**
+ * cuSPARSE requires the pointer mode to be set for scalar parameters (typically alpha and beta).
+ *
+ * @param cu_handle cuSPARSE handle whose pointer mode is updated.
+ * @param is_ptr_host_accessible Selects CUSPARSE_POINTER_MODE_HOST when true and
+ *        CUSPARSE_POINTER_MODE_DEVICE otherwise.
+ */
+inline void set_pointer_mode(cusparseHandle_t cu_handle, bool is_ptr_host_accessible) {
+    const auto cu_pointer_mode =
+        is_ptr_host_accessible ? CUSPARSE_POINTER_MODE_HOST : CUSPARSE_POINTER_MODE_DEVICE;
+    // Report a failure instead of silently continuing with a stale pointer mode, consistently
+    // with the status checks done in set_matrix_attributes.
+    auto status = cusparseSetPointerMode(cu_handle, cu_pointer_mode);
+    check_status(status, "cusparseSetPointerMode");
+}
+
+} // namespace oneapi::mkl::sparse::cusparse::detail
+
+#endif //_ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_HELPER_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp
new file mode 100644
index 000000000..4d92daf35
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp
@@ -0,0 +1,147 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+
+/**
+ * @file Similar to cublas_scope_handle.cpp
+*/
+
+#include "cusparse_scope_handle.hpp"
+
+namespace oneapi::mkl::sparse::cusparse::detail {
+
+/**
+ * Inserts a new element in the map if its key is unique. This new element
+ * is constructed in place using args as the arguments for the construction
+ * of a value_type (which is an object of a pair type). The insertion only
+ * takes place if no other element in the container has a key equivalent to
+ * the one being emplaced (keys in a map container are unique).
+ */
+#ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+thread_local cusparse_global_handle<ur_context_handle_t>
+    CusparseScopedContextHandler::handle_helper = cusparse_global_handle<ur_context_handle_t>{};
+#else
+thread_local cusparse_global_handle<pi_context> CusparseScopedContextHandler::handle_helper =
+    cusparse_global_handle<pi_context>{};
+#endif
+
+/// Make the primary CUDA context of the interop handle's device current on this thread.
+/// The previously current context is remembered and restored by the destructor when it was
+/// a different, non-null context.
+/// NOTE(review): no cuDevicePrimaryCtxRelease matching the retain below is visible in this
+/// file - confirm the retain count is intentionally left incremented.
+CusparseScopedContextHandler::CusparseScopedContextHandler(sycl::queue queue,
+                                                           sycl::interop_handle& ih)
+        : ih(ih),
+          needToRecover_(false) {
+    placedContext_ = new sycl::context(queue.get_context());
+    auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
+    CUcontext desired;
+    CUDA_ERROR_FUNC(cuCtxGetCurrent, &original_);
+    CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, &desired, cudaDevice);
+    if (original_ != desired) {
+        // Sets the desired context as the active one for the thread
+        CUDA_ERROR_FUNC(cuCtxSetCurrent, desired);
+        // No context is installed and the suggested context is primary
+        // This is the most common case. We can activate the context in the
+        // thread and leave it there until all the PI context referring to the
+        // same underlying CUDA primary context are destroyed. This emulates
+        // the behaviour of the CUDA runtime api, and avoids costly context
+        // switches. No action is required on this side of the if.
+        needToRecover_ = !(original_ == nullptr);
+    }
+}
+
+/// Restore the context that was current before construction, if one had to be replaced,
+/// and free the SYCL context copy allocated by the constructor.
+CusparseScopedContextHandler::~CusparseScopedContextHandler() noexcept(false) {
+    if (needToRecover_) {
+        CUDA_ERROR_FUNC(cuCtxSetCurrent, original_);
+    }
+    delete placedContext_;
+}
+
+/// Callback registered as extended deleter of the SYCL context in get_handle_and_stream.
+/// @param userData Pointer to the std::atomic<cusparseHandle_t> stored in the handle map.
+void ContextCallback(void* userData) {
+    auto* ptr = static_cast<std::atomic<cusparseHandle_t>*>(userData);
+    if (!ptr) {
+        return;
+    }
+    // Take the handle out of the atomic so that it is destroyed at most once from here.
+    auto handle = ptr->exchange(nullptr);
+    if (handle != nullptr) {
+        CUSPARSE_ERR_FUNC(cusparseDestroy, handle);
+    }
+    else {
+        // if the handle is nullptr it means the handle was already destroyed by
+        // the cusparse_global_handle destructor and we're free to delete the atomic
+        // object.
+        delete ptr;
+    }
+}
+
+/// Return the cuSPARSE handle cached for the device's primary context on this thread,
+/// creating and caching it on first use, together with the native CUDA stream of the queue.
+/// A cached handle is re-bound to the queue's stream when it currently targets another one.
+std::pair<cusparseHandle_t, CUstream> CusparseScopedContextHandler::get_handle_and_stream(
+    const sycl::queue& queue) {
+    auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
+    CUcontext desired;
+    CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, &desired, cudaDevice);
+    // The primary CUDA context is used as key of the per-thread handle map.
+#ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+    auto piPlacedContext_ = reinterpret_cast<ur_context_handle_t>(desired);
+#else
+    auto piPlacedContext_ = reinterpret_cast<pi_context>(desired);
+#endif
+    CUstream streamId = get_stream(queue);
+    auto it = handle_helper.cusparse_global_handle_mapper_.find(piPlacedContext_);
+    if (it != handle_helper.cusparse_global_handle_mapper_.end()) {
+        if (it->second == nullptr) {
+            // Stale entry without atomic: drop it and create a new handle below.
+            handle_helper.cusparse_global_handle_mapper_.erase(it);
+        }
+        else {
+            auto handle = it->second->load();
+            if (handle != nullptr) {
+                cudaStream_t currentStreamId;
+                CUSPARSE_ERR_FUNC(cusparseGetStream, handle, &currentStreamId);
+                if (currentStreamId != streamId) {
+                    CUSPARSE_ERR_FUNC(cusparseSetStream, handle, streamId);
+                }
+                return { handle, streamId };
+            }
+            else {
+                // The handle was already destroyed: drop the entry and create a new handle below.
+                handle_helper.cusparse_global_handle_mapper_.erase(it);
+            }
+        }
+    }
+
+    cusparseHandle_t handle;
+    CUSPARSE_ERR_FUNC(cusparseCreate, &handle);
+    CUSPARSE_ERR_FUNC(cusparseSetStream, handle, streamId);
+
+    auto insert_iter = handle_helper.cusparse_global_handle_mapper_.insert(
+        std::make_pair(piPlacedContext_, new std::atomic<cusparseHandle_t>(handle)));
+
+    // Have ContextCallback invoked with the stored atomic when the SYCL context is released.
+    sycl::detail::pi::contextSetExtendedDeleter(*placedContext_, ContextCallback,
+                                                insert_iter.first->second);
+
+    return { handle, streamId };
+}
+
+/// Same as get_handle_and_stream but only returns the cuSPARSE handle.
+cusparseHandle_t CusparseScopedContextHandler::get_handle(const sycl::queue& queue) {
+    return get_handle_and_stream(queue).first;
+}
+
+/// Return the native CUDA stream of the queue.
+CUstream CusparseScopedContextHandler::get_stream(const sycl::queue& queue) {
+    return sycl::get_native<sycl::backend::ext_oneapi_cuda>(queue);
+}
+
+/// Return the SYCL context of the queue.
+sycl::context CusparseScopedContextHandler::get_context(const sycl::queue& queue) {
+    return queue.get_context();
+}
+
+} // namespace oneapi::mkl::sparse::cusparse::detail
diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp
new file mode 100644
index 000000000..7b8313ee6
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp
@@ -0,0 +1,88 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_
+
+/**
+ * @file Similar to cublas_scope_handle.hpp
+*/
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+// After Plugin Interface removal in DPC++ ur.hpp is the new include
+#if __has_include(<sycl/detail/ur.hpp>) && !defined(ONEAPI_ONEMKL_PI_INTERFACE_REMOVED)
+#define ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+#endif
+
+#include <thread>
+#include <utility>
+
+#include "cusparse_error.hpp"
+#include "cusparse_global_handle.hpp"
+#include "cusparse_helper.hpp"
+
+namespace oneapi::mkl::sparse::cusparse::detail {
+
+/// Scoped helper giving access to the cuSPARSE handle and the CUDA stream to use from within
+/// an interop task. The per-thread cache of cuSPARSE handles lives in handle_helper.
+class CusparseScopedContextHandler {
+    CUcontext original_; // CUDA context that was current before this object was created
+    sycl::context* placedContext_; // owned; released by the destructor
+    sycl::interop_handle& ih; // must outlive this object
+    bool needToRecover_; // whether original_ must be made current again on destruction
+
+#ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+    static thread_local cusparse_global_handle<ur_context_handle_t> handle_helper;
+#else
+    static thread_local cusparse_global_handle<pi_context> handle_helper;
+#endif
+
+    CUstream get_stream(const sycl::queue& queue);
+    sycl::context get_context(const sycl::queue& queue);
+
+public:
+    CusparseScopedContextHandler(sycl::queue queue, sycl::interop_handle& ih);
+
+    // Non-copyable: the object owns placedContext_ through a raw pointer that the destructor
+    // deletes, so an implicit copy would free it twice.
+    CusparseScopedContextHandler(const CusparseScopedContextHandler&) = delete;
+    CusparseScopedContextHandler& operator=(const CusparseScopedContextHandler&) = delete;
+
+    ~CusparseScopedContextHandler() noexcept(false);
+
+    /**
+     * @brief get_handle_and_stream: creates the handle by implicitly following the advice
+     * given by NVIDIA for creating a cusparse_global_handle (e.g. one cuStream per device
+     * per thread).
+     * @param queue sycl queue.
+     * @return a pair of: cusparseHandle_t a handle to construct cusparse routines; and a CUDA stream
+     */
+    std::pair<cusparseHandle_t, CUstream> get_handle_and_stream(const sycl::queue& queue);
+
+    /// See get_handle_and_stream
+    cusparseHandle_t get_handle(const sycl::queue& queue);
+};
+
+// Get the native pointer from an accessor. This is a different pointer than
+// what can be retrieved with get_multi_ptr.
+template <typename AccT>
+inline void* get_mem(sycl::interop_handle ih, AccT acc) {
+    auto cudaPtr = ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc);
+    return reinterpret_cast<void*>(cudaPtr);
+}
+
+} // namespace oneapi::mkl::sparse::cusparse::detail
+
+#endif //_ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_task.hpp b/src/sparse_blas/backends/cusparse/cusparse_task.hpp
new file mode 100644
index 000000000..0d86d642d
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_task.hpp
@@ -0,0 +1,431 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+* +**************************************************************************/ + +#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_ +#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_ + +#include "cusparse_handles.hpp" +#include "cusparse_scope_handle.hpp" + +/// This file provides helper functions to submit host_task or native commands using buffers or USM seamlessly. The get_value_accessor, get_row_accessor and get_col_accessor helpers (and their array variants) create read_write accessors on the buffers stored in the handles, the submit_host_task and submit_native_command_ext helpers enqueue the functor, and the dispatch_submit helpers select the accessor types from the value and index types of the handles. + +namespace oneapi::mkl::sparse::cusparse::detail { + +template <typename T, typename Container> +auto get_value_accessor(sycl::handler& cgh, Container container) { + auto buffer_ptr = + reinterpret_cast<sycl::buffer<T, 1>*>(container->value_container.buffer_ptr.get()); + return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh); +} + +template <typename T, typename... Ts> +auto get_fp_accessors(sycl::handler& cgh, Ts... containers) { + return std::array<sycl::accessor<T, 1>, sizeof...(containers)>{ get_value_accessor<T>( + cgh, containers)... }; +} + +template <typename T> +auto get_row_accessor(sycl::handler& cgh, matrix_handle_t smhandle) { + auto buffer_ptr = + reinterpret_cast<sycl::buffer<T, 1>*>(smhandle->row_container.buffer_ptr.get()); + return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh); +} + +template <typename T> +auto get_col_accessor(sycl::handler& cgh, matrix_handle_t smhandle) { + auto buffer_ptr = + reinterpret_cast<sycl::buffer<T, 1>*>(smhandle->col_container.buffer_ptr.get()); + return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh); +} + +template <typename T> +auto get_int_accessors(sycl::handler& cgh, matrix_handle_t smhandle) { + return std::array<sycl::accessor<T, 1>, 2>{ get_row_accessor<T>(cgh, smhandle), + get_col_accessor<T>(cgh, smhandle) }; +} + +template <typename Functor, typename... CaptureOnlyAcc> +void submit_host_task(sycl::handler& cgh, sycl::queue& queue, Functor functor, + CaptureOnlyAcc... capture_only_accessors) { + // Enqueues functor in a host_task and calls it with the interop_handle. Only capture the accessors to ensure the dependencies are properly + // handled. 
The accessors's pointer have already been set to the native + // container types in previous functions. This assumes the underlying + // pointer of the buffer does not change. This is not guaranteed by the SYCL + // specification but should be true for all the implementations. This + // assumption avoids the overhead of resetting the pointer of all data + // handles for each enqueued command. + cgh.host_task([functor, queue, capture_only_accessors...](sycl::interop_handle ih) { + auto unused = std::make_tuple(capture_only_accessors...); + (void)unused; + functor(ih); + }); +} + +template <typename Functor, typename... CaptureOnlyAcc> +void submit_host_task_with_acc(sycl::handler& cgh, sycl::queue& queue, Functor functor, + sycl::accessor<std::uint8_t> workspace_acc, + CaptureOnlyAcc... capture_only_accessors) { + // Same as submit_host_task but workspace_acc is also given to functor as its last argument. Only capture the accessors to ensure the dependencies are properly + // handled. The accessors's pointer have already been set to the native + // container types in previous functions. This assumes the underlying + // pointer of the buffer does not change. This is not guaranteed by the SYCL + // specification but should be true for all the implementations. This + // assumption avoids the overhead of resetting the pointer of all data + // handles for each enqueued command. + cgh.host_task( + [functor, queue, workspace_acc, capture_only_accessors...](sycl::interop_handle ih) { + auto unused = std::make_tuple(capture_only_accessors...); + (void)unused; + functor(ih, workspace_acc); + }); +} + +template <typename Functor, typename... CaptureOnlyAcc> +void submit_native_command_ext(sycl::handler& cgh, sycl::queue& queue, Functor functor, + const std::vector<sycl::event>& dependencies, + CaptureOnlyAcc... capture_only_accessors) { + // Enqueues functor with ext_codeplay_enqueue_native_command if SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND is defined, otherwise falls back to submit_host_task. Only capture the accessors to ensure the dependencies are properly + // handled. The accessors's pointer have already been set to the native + // container types in previous functions. 
This assumes the underlying + // pointer of the buffer does not change. This is not guaranteed by the SYCL + // specification but should be true for all the implementations. This + // assumption avoids the overhead of resetting the pointer of all data + // handles for each enqueued command. +#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND + cgh.ext_codeplay_enqueue_native_command( + [functor, queue, dependencies, capture_only_accessors...](sycl::interop_handle ih) { + auto unused = std::make_tuple(capture_only_accessors...); + (void)unused; + // The functor using ext_codeplay_enqueue_native_command needs to + // explicitly wait on the events for the SPARSE domain. The + // extension ext_codeplay_enqueue_native_command is used to launch + // the compute operation which depends on the previous optimize + // step. In cuSPARSE the optimize step is synchronous but it is + // asynchronous in oneMKL Interface. The optimize step may not use + // the CUDA stream which would make it impossible for + // ext_codeplay_enqueue_native_command to automatically ensure it + // has completed before the compute function starts. These waits are + // used to ensure the optimize step has completed before starting + // the computation. + for (auto event : dependencies) { + event.wait(); + } + functor(ih); + }); +#else + (void)dependencies; + submit_host_task(cgh, queue, functor, capture_only_accessors...); +#endif +} + +template <typename Functor, typename... CaptureOnlyAcc> +void submit_native_command_ext_with_acc(sycl::handler& cgh, sycl::queue& queue, Functor functor, + const std::vector<sycl::event>& dependencies, + sycl::accessor<std::uint8_t> workspace_acc, + CaptureOnlyAcc... capture_only_accessors) { + // Same as submit_native_command_ext but workspace_acc is also given to functor; falls back to submit_host_task_with_acc if the extension is not available. Only capture the accessors to ensure the dependencies are properly + // handled. The accessors's pointer have already been set to the native + // container types in previous functions. This assumes the underlying + // pointer of the buffer does not change. 
This is not guaranteed by the SYCL + // specification but should be true for all the implementations. This + // assumption avoids the overhead of resetting the pointer of all data + // handles for each enqueued command. +#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND + cgh.ext_codeplay_enqueue_native_command([functor, queue, dependencies, workspace_acc, + capture_only_accessors...](sycl::interop_handle ih) { + auto unused = std::make_tuple(capture_only_accessors...); + (void)unused; + // The functor using ext_codeplay_enqueue_native_command needs to + // explicitly wait on the events for the SPARSE domain. The + // extension ext_codeplay_enqueue_native_command is used to launch + // the compute operation which depends on the previous optimize + // step. In cuSPARSE the optimize step is synchronous but it is + // asynchronous in oneMKL Interface. The optimize step may not use + // the CUDA stream which would make it impossible for + // ext_codeplay_enqueue_native_command to automatically ensure it + // has completed before the compute function starts. These waits are + // used to ensure the optimize step has completed before starting + // the computation. + for (auto event : dependencies) { + event.wait(); + } + functor(ih, workspace_acc); + }); +#else + (void)dependencies; + submit_host_task_with_acc(cgh, queue, functor, workspace_acc, capture_only_accessors...); +#endif +} + +/// Helper submit function to capture all accessors from \p sm_handle and the generic containers +/// \p other_containers and ensure the dependencies of buffers are respected. +/// The accessors are not directly used as the underlying data pointer has +/// already been captured in previous functions. +/// \p workspace_buffer is an optional buffer. Its accessor will be given to the +/// functor as a last argument if \p UseWorkspace is true. +/// \p UseWorkspace must be true to use the given \p workspace_buffer. 
+/// \p UseEnqueueNativeCommandExt controls whether host_tasks are used or the +/// extension ext_codeplay_enqueue_native_command is used to launch tasks. The +/// extension should only be used for asynchronous functions using native +/// backend's functions. The extension can only be used for in-order queues as +/// the same cuStream needs to be used for the 3 steps to run an operation: +/// querying the buffer size, optimizing and running the computation. This means +/// a different cuStream can be used inside the native_command than the native +/// cuStream used by the extension. Returns the event of the submitted command group. Throws oneapi::mkl::exception if the handles use buffers with an unsupported value or index type, or if \p UseWorkspace is true while the handles use USM. +template <bool UseWorkspace, bool UseEnqueueNativeCommandExt, typename Functor, typename... Ts> +sycl::event dispatch_submit_impl_fp_int(const std::string& function_name, sycl::queue queue, + const std::vector<sycl::event>& dependencies, + Functor functor, matrix_handle_t sm_handle, + sycl::buffer<std::uint8_t> workspace_buffer, + Ts... other_containers) { + bool is_in_order_queue = queue.is_in_order(); + if (sm_handle->all_use_buffer()) { + data_type value_type = sm_handle->get_value_type(); + data_type int_type = sm_handle->get_int_type(); + +#define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, INT_TYPE) \ + return queue.submit([&](sycl::handler& cgh) { \ + cgh.depends_on(dependencies); \ + auto fp_accs = get_fp_accessors<FP_TYPE>(cgh, sm_handle, other_containers...); \ + auto int_accs = get_int_accessors<INT_TYPE>(cgh, sm_handle); \ + auto workspace_acc = workspace_buffer.get_access<sycl::access::mode::read_write>(cgh); \ + if constexpr (UseWorkspace) { \ + if constexpr (UseEnqueueNativeCommandExt) { \ + if (is_in_order_queue) { \ + submit_native_command_ext_with_acc(cgh, queue, functor, dependencies, \ + workspace_acc, fp_accs, int_accs); \ + } \ + else { \ + submit_host_task_with_acc(cgh, queue, functor, workspace_acc, fp_accs, \ + int_accs); \ + } \ + } \ + else { \ + submit_host_task_with_acc(cgh, queue, functor, workspace_acc, fp_accs, int_accs); \ + } \ + } \ + else { \ + 
(void)workspace_buffer; \ + if constexpr (UseEnqueueNativeCommandExt) { \ + if (is_in_order_queue) { \ + submit_native_command_ext(cgh, queue, functor, dependencies, fp_accs, \ + int_accs); \ + } \ + else { \ + submit_host_task(cgh, queue, functor, fp_accs, int_accs); \ + } \ + } \ + else { \ + submit_host_task(cgh, queue, functor, fp_accs, int_accs); \ + } \ + } \ + }) +#define ONEMKL_CUSPARSE_SUBMIT_INT(FP_TYPE) \ + if (int_type == data_type::int32) { \ + ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, std::int32_t); \ + } \ + else if (int_type == data_type::int64) { \ + ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, std::int64_t); \ + } + + if (value_type == data_type::real_fp32) { + ONEMKL_CUSPARSE_SUBMIT_INT(float) + } + else if (value_type == data_type::real_fp64) { + ONEMKL_CUSPARSE_SUBMIT_INT(double) + } + else if (value_type == data_type::complex_fp32) { + ONEMKL_CUSPARSE_SUBMIT_INT(std::complex<float>) + } + else if (value_type == data_type::complex_fp64) { + ONEMKL_CUSPARSE_SUBMIT_INT(std::complex<double>) + } + +#undef ONEMKL_CUSPARSE_SUBMIT_INT +#undef ONEMKL_CUSPARSE_SUBMIT + + throw oneapi::mkl::exception("sparse_blas", function_name, + "Could not dispatch buffer kernel to a supported type"); + } + else { + // USM submit does not need to capture accessors, the dependencies are only expressed through the given events + if constexpr (!UseWorkspace) { + return queue.submit([&](sycl::handler& cgh) { + cgh.depends_on(dependencies); + if constexpr (UseEnqueueNativeCommandExt) { + if (is_in_order_queue) { + submit_native_command_ext(cgh, queue, functor, dependencies); + } + else { + submit_host_task(cgh, queue, functor); + } + } + else { + submit_host_task(cgh, queue, functor); + } + }); + } + else { + throw oneapi::mkl::exception("sparse_blas", function_name, + "Internal error: Cannot use accessor workspace with USM"); + } + } +} + +/// Similar to dispatch_submit_impl_fp_int but only dispatches the host_task based on the floating point value type. Only the value accessor of \p container_handle is captured, no workspace is supported and host_task is always used. 
+template <typename Functor, typename ContainerT> +sycl::event dispatch_submit_impl_fp(const std::string& function_name, sycl::queue queue, + const std::vector<sycl::event>& dependencies, Functor functor, + ContainerT container_handle) { + if (container_handle->all_use_buffer()) { + data_type value_type = container_handle->get_value_type(); + +#define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE) \ + return queue.submit([&](sycl::handler& cgh) { \ + cgh.depends_on(dependencies); \ + auto fp_accs = get_fp_accessors<FP_TYPE>(cgh, container_handle); \ + submit_host_task(cgh, queue, functor, fp_accs); \ + }) + + if (value_type == data_type::real_fp32) { + ONEMKL_CUSPARSE_SUBMIT(float); + } + else if (value_type == data_type::real_fp64) { + ONEMKL_CUSPARSE_SUBMIT(double); + } + else if (value_type == data_type::complex_fp32) { + ONEMKL_CUSPARSE_SUBMIT(std::complex<float>); + } + else if (value_type == data_type::complex_fp64) { + ONEMKL_CUSPARSE_SUBMIT(std::complex<double>); + } + +#undef ONEMKL_CUSPARSE_SUBMIT + + throw oneapi::mkl::exception("sparse_blas", function_name, + "Could not dispatch buffer kernel to a supported type"); + } + else { + return queue.submit([&](sycl::handler& cgh) { + cgh.depends_on(dependencies); + submit_host_task(cgh, queue, functor); + }); + } +} + +/// Helper function for dispatch_submit_impl_fp_int using a buffer workspace: the accessor of \p workspace_buffer is given to \p functor (UseWorkspace is true), host_task is always used and no event dependencies are forwarded. +template <typename Functor, typename... Ts> +sycl::event dispatch_submit(const std::string& function_name, sycl::queue queue, Functor functor, + matrix_handle_t sm_handle, sycl::buffer<std::uint8_t> workspace_buffer, + Ts... other_containers) { + constexpr bool UseWorkspace = true; + constexpr bool UseEnqueueNativeCommandExt = false; + return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>( + function_name, queue, {}, functor, sm_handle, workspace_buffer, other_containers...); +} + +/// Helper function for dispatch_submit_impl_fp_int without workspace: forwards \p dependencies, always uses host_task and passes a zero-sized placeholder buffer as workspace. +template <typename Functor, typename... 
Ts> +sycl::event dispatch_submit(const std::string& function_name, sycl::queue queue, + const std::vector<sycl::event>& dependencies, Functor functor, + matrix_handle_t sm_handle, Ts... other_containers) { + constexpr bool UseWorkspace = false; + constexpr bool UseEnqueueNativeCommandExt = false; + sycl::buffer<std::uint8_t> no_workspace(sycl::range<1>(0)); + return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>( + function_name, queue, dependencies, functor, sm_handle, no_workspace, other_containers...); +} + +/// Helper function for dispatch_submit_impl_fp_int without workspace and without event dependencies: always uses host_task and passes a zero-sized placeholder buffer as workspace. +template <typename Functor, typename... Ts> +sycl::event dispatch_submit(const std::string& function_name, sycl::queue queue, Functor functor, + matrix_handle_t sm_handle, Ts... other_containers) { + constexpr bool UseWorkspace = false; + constexpr bool UseEnqueueNativeCommandExt = false; + sycl::buffer<std::uint8_t> no_workspace(sycl::range<1>(0)); + return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>( + function_name, queue, {}, functor, sm_handle, no_workspace, other_containers...); +} + +/// Helper function for dispatch_submit_impl_fp_int using a buffer workspace and no event dependencies. UseEnqueueNativeCommandExt is true only if SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND is defined. +template <typename Functor, typename... Ts> +sycl::event dispatch_submit_native_ext(const std::string& function_name, sycl::queue queue, + Functor functor, matrix_handle_t sm_handle, + sycl::buffer<std::uint8_t> workspace_buffer, + Ts... other_containers) { + constexpr bool UseWorkspace = true; +#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND + constexpr bool UseEnqueueNativeCommandExt = true; +#else + constexpr bool UseEnqueueNativeCommandExt = false; +#endif + return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>( + function_name, queue, {}, functor, sm_handle, workspace_buffer, other_containers...); +} + +/// Helper function for dispatch_submit_impl_fp_int without workspace: forwards \p dependencies and passes a zero-sized placeholder buffer as workspace. UseEnqueueNativeCommandExt is true only if SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND is defined. +template <typename Functor, typename... 
Ts> +sycl::event dispatch_submit_native_ext(const std::string& function_name, sycl::queue queue, + const std::vector<sycl::event>& dependencies, + Functor functor, matrix_handle_t sm_handle, + Ts... other_containers) { + constexpr bool UseWorkspace = false; +#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND + constexpr bool UseEnqueueNativeCommandExt = true; +#else + constexpr bool UseEnqueueNativeCommandExt = false; +#endif + sycl::buffer<std::uint8_t> no_workspace(sycl::range<1>(0)); + return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>( + function_name, queue, dependencies, functor, sm_handle, no_workspace, other_containers...); +} + +/// Helper function for dispatch_submit_impl_fp_int without workspace and without event dependencies: passes a zero-sized placeholder buffer as workspace. UseEnqueueNativeCommandExt is true only if SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND is defined. +template <typename Functor, typename... Ts> +sycl::event dispatch_submit_native_ext(const std::string& function_name, sycl::queue queue, + Functor functor, matrix_handle_t sm_handle, + Ts... other_containers) { + constexpr bool UseWorkspace = false; +#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND + constexpr bool UseEnqueueNativeCommandExt = true; +#else + constexpr bool UseEnqueueNativeCommandExt = false; +#endif + sycl::buffer<std::uint8_t> no_workspace(sycl::range<1>(0)); + return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>( + function_name, queue, {}, functor, sm_handle, no_workspace, other_containers...); +} + +// Helper function for functors submitted to host_task or native_command: synchronizes cu_stream when it is needed. +// When the extension is disabled, host_tasks are used and the synchronization is needed to ensure the sycl::event corresponds to the end of the whole functor. +// When the extension is enabled, host_tasks are still used for out-of-order queues, see description of dispatch_submit_impl_fp_int. The stream is then only synchronized if is_in_order_queue is false. 
+inline void synchronize_if_needed(bool is_in_order_queue, CUstream cu_stream) { +#ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND + (void)is_in_order_queue; + CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); +#else + if (!is_in_order_queue) { + CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); + } +#endif +} + +} // namespace oneapi::mkl::sparse::cusparse::detail + +#endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_ diff --git a/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp b/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp new file mode 100644 index 000000000..278aec296 --- /dev/null +++ b/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp @@ -0,0 +1,32 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +**************************************************************************/ + +#include "oneapi/mkl/sparse_blas/types.hpp" + +#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp" + +#include "sparse_blas/function_table.hpp" + +#define WRAPPER_VERSION 1 +#define BACKEND cusparse + +extern "C" sparse_blas_function_table_t mkl_sparse_blas_table = { + WRAPPER_VERSION, +#include "sparse_blas/backends/backend_wrappers.cxx" +}; diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp new file mode 100644 index 000000000..5fd24d3f4 --- /dev/null +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp @@ -0,0 +1,336 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +**************************************************************************/ + +#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp" + +#include "sparse_blas/backends/cusparse/cusparse_error.hpp" +#include "sparse_blas/backends/cusparse/cusparse_helper.hpp" +#include "sparse_blas/backends/cusparse/cusparse_task.hpp" +#include "sparse_blas/backends/cusparse/cusparse_handles.hpp" +#include "sparse_blas/common_op_verification.hpp" +#include "sparse_blas/macros.hpp" +#include "sparse_blas/matrix_view_comparison.hpp" +#include "sparse_blas/sycl_helper.hpp" + +namespace oneapi::mkl::sparse { + +// Complete the definition of the incomplete type +struct spmm_descr { + // Cache the CUstream and global handle to avoid relying on CusparseScopedContextHandler to retrieve them. + // cuSPARSE seems to implicitly require using the same CUstream for a whole operation (buffer_size, optimization and computation steps). + // This is needed as the default SYCL queue is out-of-order which can have a different CUstream for each host_task or native_command. 
+ CUstream cu_stream; + cusparseHandle_t cu_handle; + + detail::generic_container workspace; + std::size_t temp_buffer_size = 0; + bool buffer_size_called = false; + bool optimized_called = false; + oneapi::mkl::transpose last_optimized_opA; + oneapi::mkl::transpose last_optimized_opB; + matrix_view last_optimized_A_view; + matrix_handle_t last_optimized_A_handle; + dense_matrix_handle_t last_optimized_B_handle; + dense_matrix_handle_t last_optimized_C_handle; + spmm_alg last_optimized_alg; +}; + +} // namespace oneapi::mkl::sparse + +namespace oneapi::mkl::sparse::cusparse { + +namespace detail { + +inline auto get_cuda_spmm_alg(spmm_alg alg) { + switch (alg) { + case spmm_alg::coo_alg1: return CUSPARSE_SPMM_COO_ALG1; + case spmm_alg::coo_alg2: return CUSPARSE_SPMM_COO_ALG2; + case spmm_alg::coo_alg3: return CUSPARSE_SPMM_COO_ALG3; + case spmm_alg::coo_alg4: return CUSPARSE_SPMM_COO_ALG4; + case spmm_alg::csr_alg1: return CUSPARSE_SPMM_CSR_ALG1; + case spmm_alg::csr_alg2: return CUSPARSE_SPMM_CSR_ALG2; + case spmm_alg::csr_alg3: return CUSPARSE_SPMM_CSR_ALG3; + default: return CUSPARSE_SPMM_ALG_DEFAULT; + } +} + +void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, dense_matrix_handle_t C_handle, + bool is_alpha_host_accessible, bool is_beta_host_accessible, spmm_alg alg) { + check_valid_spmm_common(function_name, A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible); + check_valid_matrix_properties(function_name, A_handle); + if (alg == spmm_alg::csr_alg3 && opA != oneapi::mkl::transpose::nontrans) { + throw mkl::unimplemented( + "sparse_blas", function_name, + "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opA` is not `transpose::nontrans`."); + } + if (alg == spmm_alg::csr_alg3 && opB == oneapi::mkl::transpose::conjtrans) { + throw mkl::unimplemented( + 
"sparse_blas", function_name, + "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opB` is `transpose::conjtrans`."); + } + if (alg == spmm_alg::csr_alg3 && opB == oneapi::mkl::transpose::trans && + A_handle->get_value_type() == data_type::real_fp64) { + // TODO: Remove once the issue is fixed: https://forums.developer.nvidia.com/t/cusparse-spmm-sample-failing-with-misaligned-address/311022 + throw mkl::unimplemented( + "sparse_blas", function_name, + "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opB` is `transpose::trans` and the real fp64 precision is used."); + } +} + +inline void common_spmm_optimize(oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + bool is_alpha_host_accessible, matrix_view A_view, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, + bool is_beta_host_accessible, dense_matrix_handle_t C_handle, + spmm_alg alg, spmm_descr_t spmm_descr) { + check_valid_spmm("spmm_optimize", opA, opB, A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible, alg); + if (!spmm_descr->buffer_size_called) { + throw mkl::uninitialized("sparse_blas", "spmm_optimize", + "spmm_buffer_size must be called before spmm_optimize."); + } + spmm_descr->optimized_called = true; + spmm_descr->last_optimized_opA = opA; + spmm_descr->last_optimized_opB = opB; + spmm_descr->last_optimized_A_view = A_view; + spmm_descr->last_optimized_A_handle = A_handle; + spmm_descr->last_optimized_B_handle = B_handle; + spmm_descr->last_optimized_C_handle = C_handle; + spmm_descr->last_optimized_alg = alg; +} + +void spmm_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, + dense_matrix_handle_t C_handle, spmm_alg alg, void* workspace_ptr, + bool is_alpha_host_accessible) { + auto cu_a = A_handle->backend_handle; + auto cu_b = 
B_handle->backend_handle; + auto cu_c = C_handle->backend_handle; + auto type = A_handle->value_container.data_type; + auto cu_op_a = get_cuda_operation(type, opA); + auto cu_op_b = get_cuda_operation(type, opB); + auto cu_type = get_cuda_value_type(type); + auto cu_alg = get_cuda_spmm_alg(alg); + set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpMM_preprocess(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, + cu_c, cu_type, cu_alg, workspace_ptr); + check_status(status, "spmm_optimize"); +} + +} // namespace detail + +void init_spmm_descr(sycl::queue& /*queue*/, spmm_descr_t* p_spmm_descr) { + *p_spmm_descr = new spmm_descr(); +} + +sycl::event release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr, + const std::vector<sycl::event>& dependencies) { + if (!spmm_descr) { + return detail::collapse_dependencies(queue, dependencies); + } + + auto release_functor = [=]() { + spmm_descr->cu_handle = nullptr; + spmm_descr->last_optimized_A_handle = nullptr; + spmm_descr->last_optimized_B_handle = nullptr; + spmm_descr->last_optimized_C_handle = nullptr; + delete spmm_descr; + }; + + // Use dispatch_submit to ensure the descriptor is kept alive as long as the buffers are used + // dispatch_submit can only be used if the descriptor's handles are valid + if (spmm_descr->last_optimized_A_handle && + spmm_descr->last_optimized_A_handle->all_use_buffer() && + spmm_descr->last_optimized_B_handle && spmm_descr->last_optimized_C_handle && + spmm_descr->workspace.use_buffer()) { + auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor<std::uint8_t>) { + release_functor(); + }; + return detail::dispatch_submit( + __func__, queue, dispatch_functor, spmm_descr->last_optimized_A_handle, + spmm_descr->workspace.get_buffer<std::uint8_t>(), spmm_descr->last_optimized_B_handle, + spmm_descr->last_optimized_C_handle); + } + + // Release used if USM is used or if the descriptor has been released before spmm_optimize has succeeded + 
sycl::event event = queue.submit([&](sycl::handler& cgh) { + cgh.depends_on(dependencies); + cgh.host_task(release_functor); + }); + return event; +} + +void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, + dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, + std::size_t& temp_buffer_size) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + detail::check_valid_spmm(__func__, opA, opB, A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible, alg); + auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) { + detail::CusparseScopedContextHandler sc(queue, ih); + auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); + spmm_descr->cu_handle = cu_handle; + spmm_descr->cu_stream = cu_stream; + auto cu_a = A_handle->backend_handle; + auto cu_b = B_handle->backend_handle; + auto cu_c = C_handle->backend_handle; + auto type = A_handle->value_container.data_type; + auto cu_op_a = detail::get_cuda_operation(type, opA); + auto cu_op_b = detail::get_cuda_operation(type, opB); + auto cu_type = detail::get_cuda_value_type(type); + auto cu_alg = detail::get_cuda_spmm_alg(alg); + detail::set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpMM_bufferSize(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, + cu_c, cu_type, cu_alg, &temp_buffer_size); + detail::check_status(status, __func__); + }; + auto event = detail::dispatch_submit(__func__, queue, functor, A_handle, B_handle, C_handle); + event.wait_and_throw(); + spmm_descr->temp_buffer_size = temp_buffer_size; + spmm_descr->buffer_size_called = true; +} + +void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + 
const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, + spmm_alg alg, spmm_descr_t spmm_descr, sycl::buffer<std::uint8_t, 1> workspace) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + if (!A_handle->all_use_buffer()) { + detail::throw_incompatible_container(__func__); + } + detail::common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle, + is_beta_host_accessible, C_handle, alg, spmm_descr); + // Copy the buffer to extend its lifetime until the descriptor is free'd. + spmm_descr->workspace.set_buffer_untyped(workspace); + if (alg == spmm_alg::no_optimize_alg || workspace.size() == 0) { + // cusparseSpMM_preprocess cannot be called if the workspace is empty + return; + } + auto functor = [=](sycl::interop_handle ih, sycl::accessor<std::uint8_t> workspace_acc) { + auto cu_handle = spmm_descr->cu_handle; + auto workspace_ptr = detail::get_mem(ih, workspace_acc); + detail::spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, + alg, workspace_ptr, is_alpha_host_accessible); + }; + + detail::dispatch_submit(__func__, queue, functor, A_handle, workspace, B_handle, C_handle); +} + +sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, + const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, + spmm_descr_t spmm_descr, void* workspace, + const std::vector<sycl::event>& dependencies) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + if (A_handle->all_use_buffer()) { + detail::throw_incompatible_container(__func__); + } + 
detail::common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle, + is_beta_host_accessible, C_handle, alg, spmm_descr); + spmm_descr->workspace.usm_ptr = workspace; + if (alg == spmm_alg::no_optimize_alg || workspace == nullptr) { + // cusparseSpMM_preprocess cannot be called if the workspace is empty + return detail::collapse_dependencies(queue, dependencies); + } + auto functor = [=](sycl::interop_handle) { + auto cu_handle = spmm_descr->cu_handle; + detail::spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, + alg, workspace, is_alpha_host_accessible); + }; + + return detail::dispatch_submit(__func__, queue, dependencies, functor, A_handle, B_handle, + C_handle); +} + +sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, + spmm_alg alg, spmm_descr_t spmm_descr, + const std::vector<sycl::event>& dependencies) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + detail::check_valid_spmm(__func__, opA, opB, A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible, alg); + if (A_handle->all_use_buffer() != spmm_descr->workspace.use_buffer()) { + detail::throw_incompatible_container(__func__); + } + + if (!spmm_descr->optimized_called) { + throw mkl::uninitialized("sparse_blas", __func__, + "spmm_optimize must be called before spmm."); + } + CHECK_DESCR_MATCH(spmm_descr, opA, "spmm_optimize"); + CHECK_DESCR_MATCH(spmm_descr, opB, "spmm_optimize"); + CHECK_DESCR_MATCH(spmm_descr, A_view, "spmm_optimize"); + CHECK_DESCR_MATCH(spmm_descr, A_handle, "spmm_optimize"); + CHECK_DESCR_MATCH(spmm_descr, B_handle, "spmm_optimize"); + CHECK_DESCR_MATCH(spmm_descr, C_handle, 
"spmm_optimize");
+    CHECK_DESCR_MATCH(spmm_descr, alg, "spmm_optimize");
+
+    bool is_in_order_queue = queue.is_in_order();
+    // Shared by the buffer and USM submissions below: runs cusparseSpMM with the handle cached in
+    // the descriptor and the workspace pointer it is given.
+    auto compute_functor = [=](void* workspace_ptr) {
+        auto cu_handle = spmm_descr->cu_handle;
+        auto cu_a = A_handle->backend_handle;
+        auto cu_b = B_handle->backend_handle;
+        auto cu_c = C_handle->backend_handle;
+        auto type = A_handle->value_container.data_type;
+        auto cu_op_a = detail::get_cuda_operation(type, opA);
+        auto cu_op_b = detail::get_cuda_operation(type, opB);
+        auto cu_type = detail::get_cuda_value_type(type);
+        auto cu_alg = detail::get_cuda_spmm_alg(alg);
+        detail::set_pointer_mode(cu_handle, is_alpha_host_accessible);
+        auto status = cusparseSpMM(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, cu_c,
+                                   cu_type, cu_alg, workspace_ptr);
+        // Inside the lambda `__func__` names the closure's call operator ("operator()"), so the
+        // entry point is spelled out to keep the error message meaningful.
+        detail::check_status(status, "spmm");
+        detail::synchronize_if_needed(is_in_order_queue, spmm_descr->cu_stream);
+    };
+    if (A_handle->all_use_buffer() && spmm_descr->temp_buffer_size > 0) {
+        // The accessor can only be created if the buffer size is greater than 0
+        auto functor_buffer = [=](sycl::interop_handle ih,
+                                  sycl::accessor<std::uint8_t> workspace_acc) {
+            auto workspace_ptr = detail::get_mem(ih, workspace_acc);
+            compute_functor(workspace_ptr);
+        };
+        return detail::dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle,
+                                                  spmm_descr->workspace.get_buffer<std::uint8_t>(),
+                                                  B_handle, C_handle);
+    }
+    else {
+        // The same dispatch_submit can be used for USM or buffers if no
+        // workspace accessor is needed, workspace_ptr will be a nullptr in the
+        // latter case.
+ auto workspace_ptr = spmm_descr->workspace.usm_ptr; + auto functor_usm = [=](sycl::interop_handle) { + compute_functor(workspace_ptr); + }; + return detail::dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm, + A_handle, B_handle, C_handle); + } +} + +} // namespace oneapi::mkl::sparse::cusparse diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp new file mode 100644 index 000000000..03b848916 --- /dev/null +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp @@ -0,0 +1,335 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +**************************************************************************/ + +#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp" + +#include "sparse_blas/backends/cusparse/cusparse_error.hpp" +#include "sparse_blas/backends/cusparse/cusparse_helper.hpp" +#include "sparse_blas/backends/cusparse/cusparse_task.hpp" +#include "sparse_blas/backends/cusparse/cusparse_handles.hpp" +#include "sparse_blas/common_op_verification.hpp" +#include "sparse_blas/macros.hpp" +#include "sparse_blas/matrix_view_comparison.hpp" +#include "sparse_blas/sycl_helper.hpp" + +namespace oneapi::mkl::sparse { + +// Complete the definition of the incomplete type +struct spmv_descr { + // Cache the CUstream and global handle to avoid relying on CusparseScopedContextHandler to retrieve them. + // cuSPARSE seem to implicitly require to use the same CUstream for a whole operation (buffer_size, optimization and computation steps). + // This is needed as the default SYCL queue is out-of-order which can have a different CUstream for each host_task or native_command. 
+ CUstream cu_stream; + cusparseHandle_t cu_handle; + + detail::generic_container workspace; + std::size_t temp_buffer_size = 0; + bool buffer_size_called = false; + bool optimized_called = false; + oneapi::mkl::transpose last_optimized_opA; + matrix_view last_optimized_A_view; + matrix_handle_t last_optimized_A_handle; + dense_vector_handle_t last_optimized_x_handle; + dense_vector_handle_t last_optimized_y_handle; + spmv_alg last_optimized_alg; +}; + +} // namespace oneapi::mkl::sparse + +namespace oneapi::mkl::sparse::cusparse { + +namespace detail { + +inline auto get_cuda_spmv_alg(spmv_alg alg) { + switch (alg) { + case spmv_alg::coo_alg1: return CUSPARSE_SPMV_COO_ALG1; + case spmv_alg::coo_alg2: return CUSPARSE_SPMV_COO_ALG2; + case spmv_alg::csr_alg1: return CUSPARSE_SPMV_CSR_ALG1; + case spmv_alg::csr_alg2: return CUSPARSE_SPMV_CSR_ALG2; + default: return CUSPARSE_SPMV_ALG_DEFAULT; + } +} + +void check_valid_spmv(const std::string& function_name, oneapi::mkl::transpose opA, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, bool is_alpha_host_accessible, + bool is_beta_host_accessible) { + check_valid_spmv_common(function_name, opA, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible, is_beta_host_accessible); + check_valid_matrix_properties(function_name, A_handle); + if (A_view.type_view != matrix_descr::general) { + throw mkl::unimplemented( + "sparse_blas", function_name, + "The backend does not support spmv with a `type_view` other than `matrix_descr::general`."); + } +} + +inline void common_spmv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible, + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, bool is_beta_host_accessible, + dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr) { + check_valid_spmv("spmv_optimize", opA, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible, 
is_beta_host_accessible); + if (!spmv_descr->buffer_size_called) { + throw mkl::uninitialized("sparse_blas", "spmv_optimize", + "spmv_buffer_size must be called before spmv_optimize."); + } + spmv_descr->optimized_called = true; + spmv_descr->last_optimized_opA = opA; + spmv_descr->last_optimized_A_view = A_view; + spmv_descr->last_optimized_A_handle = A_handle; + spmv_descr->last_optimized_x_handle = x_handle; + spmv_descr->last_optimized_y_handle = y_handle; + spmv_descr->last_optimized_alg = alg; +} + +#if CUSPARSE_VERSION >= 12300 +// cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4) +void spmv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void* alpha, + matrix_handle_t A_handle, dense_vector_handle_t x_handle, const void* beta, + dense_vector_handle_t y_handle, spmv_alg alg, void* workspace_ptr, + bool is_alpha_host_accessible) { + auto cu_a = A_handle->backend_handle; + auto cu_x = x_handle->backend_handle; + auto cu_y = y_handle->backend_handle; + auto type = A_handle->value_container.data_type; + auto cu_op = get_cuda_operation(type, opA); + auto cu_type = get_cuda_value_type(type); + auto cu_alg = get_cuda_spmv_alg(alg); + set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpMV_preprocess(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, cu_type, + cu_alg, workspace_ptr); + check_status(status, "spmv_optimize"); +} +#endif + +} // namespace detail + +void init_spmv_descr(sycl::queue& /*queue*/, spmv_descr_t* p_spmv_descr) { + *p_spmv_descr = new spmv_descr(); +} + +sycl::event release_spmv_descr(sycl::queue& queue, spmv_descr_t spmv_descr, + const std::vector<sycl::event>& dependencies) { + if (!spmv_descr) { + return detail::collapse_dependencies(queue, dependencies); + } + + auto release_functor = [=]() { + spmv_descr->cu_handle = nullptr; + spmv_descr->last_optimized_A_handle = nullptr; + spmv_descr->last_optimized_x_handle = nullptr; + spmv_descr->last_optimized_y_handle = 
nullptr; + delete spmv_descr; + }; + + // Use dispatch_submit to ensure the descriptor is kept alive as long as the buffers are used + // dispatch_submit can only be used if the descriptor's handles are valid + if (spmv_descr->last_optimized_A_handle && + spmv_descr->last_optimized_A_handle->all_use_buffer() && + spmv_descr->last_optimized_x_handle && spmv_descr->last_optimized_y_handle && + spmv_descr->workspace.use_buffer()) { + auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor<std::uint8_t>) { + release_functor(); + }; + return detail::dispatch_submit( + __func__, queue, dispatch_functor, spmv_descr->last_optimized_A_handle, + spmv_descr->workspace.get_buffer<std::uint8_t>(), spmv_descr->last_optimized_x_handle, + spmv_descr->last_optimized_y_handle); + } + + // Release used if USM is used or if the descriptor has been released before spmv_optimize has succeeded + sycl::event event = queue.submit([&](sycl::handler& cgh) { + cgh.depends_on(dependencies); + cgh.host_task(release_functor); + }); + return event; +} + +void spmv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, std::size_t& temp_buffer_size) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + detail::check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible, is_beta_host_accessible); + + auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) { + detail::CusparseScopedContextHandler sc(queue, ih); + auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); + spmv_descr->cu_handle = cu_handle; + spmv_descr->cu_stream = cu_stream; + auto cu_a = A_handle->backend_handle; + auto cu_x = x_handle->backend_handle; + auto cu_y = 
y_handle->backend_handle; + auto type = A_handle->value_container.data_type; + auto cu_op = detail::get_cuda_operation(type, opA); + auto cu_type = detail::get_cuda_value_type(type); + auto cu_alg = detail::get_cuda_spmv_alg(alg); + detail::set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpMV_bufferSize(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, + cu_type, cu_alg, &temp_buffer_size); + detail::check_status(status, __func__); + }; + auto event = detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); + event.wait_and_throw(); + spmv_descr->temp_buffer_size = temp_buffer_size; + spmv_descr->buffer_size_called = true; +} + +void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, sycl::buffer<std::uint8_t, 1> workspace) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + if (!A_handle->all_use_buffer()) { + detail::throw_incompatible_container(__func__); + } + detail::common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, + is_beta_host_accessible, y_handle, alg, spmv_descr); + // Copy the buffer to extend its lifetime until the descriptor is free'd. 
+ spmv_descr->workspace.set_buffer_untyped(workspace); + if (alg == spmv_alg::no_optimize_alg) { + return; + } + +#if CUSPARSE_VERSION < 12300 + // cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4) + return; +#else + if (spmv_descr->temp_buffer_size > 0) { + auto functor = [=](sycl::interop_handle ih, sycl::accessor<std::uint8_t> workspace_acc) { + auto cu_handle = spmv_descr->cu_handle; + auto workspace_ptr = detail::get_mem(ih, workspace_acc); + detail::spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, + alg, workspace_ptr, is_alpha_host_accessible); + }; + + // The accessor can only be created if the buffer size is greater than 0 + detail::dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle); + } + else { + auto functor = [=](sycl::interop_handle) { + auto cu_handle = spmv_descr->cu_handle; + detail::spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, + alg, nullptr, is_alpha_host_accessible); + }; + detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); + } +#endif +} + +sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, const void* beta, + dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, + void* workspace, const std::vector<sycl::event>& dependencies) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + if (A_handle->all_use_buffer()) { + detail::throw_incompatible_container(__func__); + } + detail::common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, + is_beta_host_accessible, y_handle, alg, spmv_descr); + spmv_descr->workspace.usm_ptr = workspace; + if (alg == spmv_alg::no_optimize_alg) { + return detail::collapse_dependencies(queue, dependencies); 
+ } + +#if CUSPARSE_VERSION < 12300 + // cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4) + return detail::collapse_dependencies(queue, dependencies); +#else + auto functor = [=](sycl::interop_handle) { + auto cu_handle = spmv_descr->cu_handle; + detail::spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg, + workspace, is_alpha_host_accessible); + }; + return detail::dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle, + y_handle); +#endif +} + +sycl::event spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, const std::vector<sycl::event>& dependencies) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + detail::check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible, is_beta_host_accessible); + if (A_handle->all_use_buffer() != spmv_descr->workspace.use_buffer()) { + detail::throw_incompatible_container(__func__); + } + + if (!spmv_descr->optimized_called) { + throw mkl::uninitialized("sparse_blas", __func__, + "spmv_optimize must be called before spmv."); + } + CHECK_DESCR_MATCH(spmv_descr, opA, "spmv_optimize"); + CHECK_DESCR_MATCH(spmv_descr, A_view, "spmv_optimize"); + CHECK_DESCR_MATCH(spmv_descr, A_handle, "spmv_optimize"); + CHECK_DESCR_MATCH(spmv_descr, x_handle, "spmv_optimize"); + CHECK_DESCR_MATCH(spmv_descr, y_handle, "spmv_optimize"); + CHECK_DESCR_MATCH(spmv_descr, alg, "spmv_optimize"); + + bool is_in_order_queue = queue.is_in_order(); + auto compute_functor = [=](void* workspace_ptr) { + auto cu_handle = spmv_descr->cu_handle; + auto cu_a = A_handle->backend_handle; + auto cu_x = x_handle->backend_handle; + auto cu_y = 
y_handle->backend_handle; + auto type = A_handle->value_container.data_type; + auto cu_op = detail::get_cuda_operation(type, opA); + auto cu_type = detail::get_cuda_value_type(type); + auto cu_alg = detail::get_cuda_spmv_alg(alg); + detail::set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpMV(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, cu_type, cu_alg, + workspace_ptr); + detail::check_status(status, __func__); + detail::synchronize_if_needed(is_in_order_queue, spmv_descr->cu_stream); + }; + if (A_handle->all_use_buffer() && spmv_descr->temp_buffer_size > 0) { + // The accessor can only be created if the buffer size is greater than 0 + auto functor_buffer = [=](sycl::interop_handle ih, + sycl::accessor<std::uint8_t> workspace_acc) { + auto workspace_ptr = detail::get_mem(ih, workspace_acc); + compute_functor(workspace_ptr); + }; + return detail::dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle, + spmv_descr->workspace.get_buffer<std::uint8_t>(), + x_handle, y_handle); + } + else { + // The same dispatch_submit can be used for USM or buffers if no + // workspace accessor is needed, workspace_ptr will be a nullptr in the + // latter case. 
+ auto workspace_ptr = spmv_descr->workspace.usm_ptr; + auto functor_usm = [=](sycl::interop_handle) { + compute_functor(workspace_ptr); + }; + return detail::dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm, + A_handle, x_handle, y_handle); + } +} + +} // namespace oneapi::mkl::sparse::cusparse diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp new file mode 100644 index 000000000..5c49df013 --- /dev/null +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -0,0 +1,289 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +**************************************************************************/ + +#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp" + +#include "sparse_blas/backends/cusparse/cusparse_error.hpp" +#include "sparse_blas/backends/cusparse/cusparse_helper.hpp" +#include "sparse_blas/backends/cusparse/cusparse_task.hpp" +#include "sparse_blas/backends/cusparse/cusparse_handles.hpp" +#include "sparse_blas/common_op_verification.hpp" +#include "sparse_blas/macros.hpp" +#include "sparse_blas/matrix_view_comparison.hpp" +#include "sparse_blas/sycl_helper.hpp" + +namespace oneapi::mkl::sparse { + +// Complete the definition of the incomplete type +struct spsv_descr { + // Cache the CUstream and global handle to avoid relying on CusparseScopedContextHandler to retrieve them. + // cuSPARSE seem to implicitly require to use the same CUstream for a whole operation (buffer_size, optimization and computation steps). + // This is needed as the default SYCL queue is out-of-order which can have a different CUstream for each host_task or native_command. 
+ CUstream cu_stream; + cusparseHandle_t cu_handle; + + cusparseSpSVDescr_t cu_descr; + detail::generic_container workspace; + bool buffer_size_called = false; + bool optimized_called = false; + oneapi::mkl::transpose last_optimized_opA; + matrix_view last_optimized_A_view; + matrix_handle_t last_optimized_A_handle; + dense_vector_handle_t last_optimized_x_handle; + dense_vector_handle_t last_optimized_y_handle; + spsv_alg last_optimized_alg; +}; + +} // namespace oneapi::mkl::sparse + +namespace oneapi::mkl::sparse::cusparse { + +namespace detail { + +inline auto get_cuda_spsv_alg(spsv_alg /*alg*/) { + return CUSPARSE_SPSV_ALG_DEFAULT; +} + +void check_valid_spsv(const std::string& function_name, matrix_view A_view, + matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, bool is_alpha_host_accessible) { + check_valid_spsv_common(function_name, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible); + check_valid_matrix_properties(function_name, A_handle); +} + +inline void common_spsv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible, + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, + spsv_alg alg, spsv_descr_t spsv_descr) { + check_valid_spsv("spsv_optimize", A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible); + if (!spsv_descr->buffer_size_called) { + throw mkl::uninitialized("sparse_blas", "spsv_optimize", + "spsv_buffer_size must be called before spsv_optimize."); + } + spsv_descr->optimized_called = true; + spsv_descr->last_optimized_opA = opA; + spsv_descr->last_optimized_A_view = A_view; + spsv_descr->last_optimized_A_handle = A_handle; + spsv_descr->last_optimized_x_handle = x_handle; + spsv_descr->last_optimized_y_handle = y_handle; + spsv_descr->last_optimized_alg = alg; +} + +void spsv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void* alpha, + matrix_view A_view, matrix_handle_t 
A_handle,
+                        dense_vector_handle_t x_handle, dense_vector_handle_t y_handle,
+                        spsv_alg alg, spsv_descr_t spsv_descr, void* workspace_ptr,
+                        bool is_alpha_host_accessible) {
+    auto cu_a = A_handle->backend_handle;
+    auto cu_x = x_handle->backend_handle;
+    auto cu_y = y_handle->backend_handle;
+    auto type = A_handle->value_container.data_type;
+    set_matrix_attributes("spsv_optimize", cu_a, A_view);
+    auto cu_op = get_cuda_operation(type, opA);
+    auto cu_type = get_cuda_value_type(type);
+    auto cu_alg = get_cuda_spsv_alg(alg);
+    auto cu_descr = spsv_descr->cu_descr;
+    set_pointer_mode(cu_handle, is_alpha_host_accessible);
+    auto status = cusparseSpSV_analysis(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg,
+                                        cu_descr, workspace_ptr);
+    check_status(status, "spsv_optimize");
+}
+
+} // namespace detail
+
+/// Allocate an spsv descriptor and create its cuSPARSE SpSV descriptor.
+///
+/// On success \p *p_spsv_descr owns the new descriptor. If cusparseSpSV_createDescr fails, the
+/// allocation is freed, \p *p_spsv_descr is set to nullptr and the error is rethrown.
+void init_spsv_descr(sycl::queue& /*queue*/, spsv_descr_t* p_spsv_descr) {
+    auto* descr = new spsv_descr();
+    try {
+        // NOTE(review): CUSPARSE_ERR_FUNC is defined in cusparse_error.hpp; presumably it throws
+        // when the wrapped call does not succeed - confirm.
+        CUSPARSE_ERR_FUNC(cusparseSpSV_createDescr, &descr->cu_descr);
+    }
+    catch (...) {
+        // Do not leak the allocation or hand a half-initialized descriptor back to the caller.
+        delete descr;
+        *p_spsv_descr = nullptr;
+        throw;
+    }
+    *p_spsv_descr = descr;
+}
+
+/// Schedule destruction of \p spsv_descr on \p queue and return the event of that task.
+///
+/// A null descriptor is accepted: the dependencies are collapsed into a single event and nothing
+/// is released.
+sycl::event release_spsv_descr(sycl::queue& queue, spsv_descr_t spsv_descr,
+                               const std::vector<sycl::event>& dependencies) {
+    if (!spsv_descr) {
+        return detail::collapse_dependencies(queue, dependencies);
+    }
+
+    auto release_functor = [=]() {
+        CUSPARSE_ERR_FUNC(cusparseSpSV_destroyDescr, spsv_descr->cu_descr);
+        spsv_descr->cu_handle = nullptr;
+        spsv_descr->cu_descr = nullptr;
+        spsv_descr->last_optimized_A_handle = nullptr;
+        spsv_descr->last_optimized_x_handle = nullptr;
+        spsv_descr->last_optimized_y_handle = nullptr;
+        delete spsv_descr;
+    };
+
+    // Use dispatch_submit to ensure the descriptor is kept alive as long as the buffers are used
+    // dispatch_submit can only be used if the descriptor's handles are valid
+    if (spsv_descr->last_optimized_A_handle &&
+        spsv_descr->last_optimized_A_handle->all_use_buffer() &&
+        spsv_descr->last_optimized_x_handle && spsv_descr->last_optimized_y_handle &&
+        spsv_descr->workspace.use_buffer()) {
+        auto dispatch_functor =
[=](sycl::interop_handle, sycl::accessor<std::uint8_t>) { + release_functor(); + }; + return detail::dispatch_submit( + __func__, queue, dispatch_functor, spsv_descr->last_optimized_A_handle, + spsv_descr->workspace.get_buffer<std::uint8_t>(), spsv_descr->last_optimized_x_handle, + spsv_descr->last_optimized_y_handle); + } + + // Release used if USM is used or if the descriptor has been released before spsv_optimize has succeeded + sycl::event event = queue.submit([&](sycl::handler& cgh) { + cgh.depends_on(dependencies); + cgh.host_task(release_functor); + }); + return event; +} + +void spsv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, + std::size_t& temp_buffer_size) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + detail::check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible); + auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) { + detail::CusparseScopedContextHandler sc(queue, ih); + auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); + spsv_descr->cu_handle = cu_handle; + spsv_descr->cu_stream = cu_stream; + auto cu_a = A_handle->backend_handle; + auto cu_x = x_handle->backend_handle; + auto cu_y = y_handle->backend_handle; + auto type = A_handle->value_container.data_type; + detail::set_matrix_attributes(__func__, cu_a, A_view); + auto cu_op = detail::get_cuda_operation(type, opA); + auto cu_type = detail::get_cuda_value_type(type); + auto cu_alg = detail::get_cuda_spsv_alg(alg); + auto cu_descr = spsv_descr->cu_descr; + detail::set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpSV_bufferSize(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, + cu_alg, cu_descr, &temp_buffer_size); + detail::check_status(status, __func__); + }; + auto event = 
detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); + event.wait_and_throw(); + spsv_descr->buffer_size_called = true; +} + +void spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, + sycl::buffer<std::uint8_t, 1> workspace) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + if (!A_handle->all_use_buffer()) { + detail::throw_incompatible_container(__func__); + } + detail::common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, + y_handle, alg, spsv_descr); + // Ignore spsv_alg::no_optimize_alg as this step is mandatory for cuSPARSE + // Copy the buffer to extend its lifetime until the descriptor is free'd. + spsv_descr->workspace.set_buffer_untyped(workspace); + + if (workspace.size() > 0) { + auto functor = [=](sycl::interop_handle ih, sycl::accessor<std::uint8_t> workspace_acc) { + auto cu_handle = spsv_descr->cu_handle; + auto workspace_ptr = detail::get_mem(ih, workspace_acc); + detail::spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, + alg, spsv_descr, workspace_ptr, is_alpha_host_accessible); + }; + + // The accessor can only be created if the buffer size is greater than 0 + detail::dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle); + } + else { + auto functor = [=](sycl::interop_handle) { + auto cu_handle = spsv_descr->cu_handle; + detail::spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, + alg, spsv_descr, nullptr, is_alpha_host_accessible); + }; + + detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); + } +} + +sycl::event spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, + matrix_view A_view, matrix_handle_t A_handle, + 
dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, + spsv_alg alg, spsv_descr_t spsv_descr, void* workspace, + const std::vector<sycl::event>& dependencies) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + if (A_handle->all_use_buffer()) { + detail::throw_incompatible_container(__func__); + } + detail::common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, + y_handle, alg, spsv_descr); + // Ignore spsv_alg::no_optimize_alg as this step is mandatory for cuSPARSE + auto functor = [=](sycl::interop_handle) { + auto cu_handle = spsv_descr->cu_handle; + detail::spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, + spsv_descr, workspace, is_alpha_host_accessible); + }; + // No need to store the workspace USM pointer as the backend stores it already + return detail::dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle, + y_handle); +} + +sycl::event spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, + const std::vector<sycl::event>& dependencies) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + detail::check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible); + if (A_handle->all_use_buffer() != spsv_descr->workspace.use_buffer()) { + detail::throw_incompatible_container(__func__); + } + + if (!spsv_descr->optimized_called) { + throw mkl::uninitialized("sparse_blas", __func__, + "spsv_optimize must be called before spsv."); + } + CHECK_DESCR_MATCH(spsv_descr, opA, "spsv_optimize"); + CHECK_DESCR_MATCH(spsv_descr, A_view, "spsv_optimize"); + CHECK_DESCR_MATCH(spsv_descr, A_handle, "spsv_optimize"); + CHECK_DESCR_MATCH(spsv_descr, x_handle, "spsv_optimize"); + CHECK_DESCR_MATCH(spsv_descr, y_handle, 
"spsv_optimize"); + CHECK_DESCR_MATCH(spsv_descr, alg, "spsv_optimize"); + + bool is_in_order_queue = queue.is_in_order(); + auto functor = [=](sycl::interop_handle) { + auto cu_handle = spsv_descr->cu_handle; + auto cu_a = A_handle->backend_handle; + auto cu_x = x_handle->backend_handle; + auto cu_y = y_handle->backend_handle; + auto type = A_handle->value_container.data_type; + detail::set_matrix_attributes(__func__, cu_a, A_view); + auto cu_op = detail::get_cuda_operation(type, opA); + auto cu_type = detail::get_cuda_value_type(type); + auto cu_alg = detail::get_cuda_spsv_alg(alg); + auto cu_descr = spsv_descr->cu_descr; + detail::set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpSV_solve(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg, + cu_descr); + detail::check_status(status, __func__); + detail::synchronize_if_needed(is_in_order_queue, spsv_descr->cu_stream); + }; + return detail::dispatch_submit_native_ext(__func__, queue, dependencies, functor, A_handle, + x_handle, y_handle); +} + +} // namespace oneapi::mkl::sparse::cusparse diff --git a/src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp b/src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp new file mode 100644 index 000000000..28c628438 --- /dev/null +++ b/src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp @@ -0,0 +1,37 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. 
+* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +**************************************************************************/ + +#ifndef _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_DISPATCH_HPP_ +#define _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_DISPATCH_HPP_ + +/// Convert \p value_type to template type argument and use it to call \p op_functor. +#define DISPATCH_MKL_OPERATION(function_name, value_type, op_functor, ...) \ + switch (value_type) { \ + case detail::data_type::real_fp32: return op_functor<float>(__VA_ARGS__); \ + case detail::data_type::real_fp64: return op_functor<double>(__VA_ARGS__); \ + case detail::data_type::complex_fp32: return op_functor<std::complex<float>>(__VA_ARGS__); \ + case detail::data_type::complex_fp64: \ + return op_functor<std::complex<double>>(__VA_ARGS__); \ + default: \ + throw oneapi::mkl::exception( \ + "sparse_blas", function_name, \ + "Internal error: unsupported type " + data_type_to_str(value_type)); \ + } + +#endif // _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_DISPATCH_HPP_ diff --git a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx index 1e4ab95f1..5fa5ea0a4 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx @@ -17,102 +17,64 @@ * **************************************************************************/ +// In this file functions and types using the namespace oneapi::mkl::sparse:: refer to the backend's namespace for better readability. 
+ // Dense vector template <typename fpType> -void init_dense_vector(sycl::queue& /*queue*/, - oneapi::mkl::sparse::dense_vector_handle_t* p_dvhandle, std::int64_t size, +void init_dense_vector(sycl::queue& /*queue*/, dense_vector_handle_t* p_dvhandle, std::int64_t size, sycl::buffer<fpType, 1> val) { - *p_dvhandle = new oneapi::mkl::sparse::dense_vector_handle(val, size); + *p_dvhandle = new dense_vector_handle(val, size); } template <typename fpType> -void init_dense_vector(sycl::queue& /*queue*/, - oneapi::mkl::sparse::dense_vector_handle_t* p_dvhandle, std::int64_t size, +void init_dense_vector(sycl::queue& /*queue*/, dense_vector_handle_t* p_dvhandle, std::int64_t size, fpType* val) { - *p_dvhandle = new oneapi::mkl::sparse::dense_vector_handle(val, size); -} - -template <typename fpType, typename InternalHandleT> -void check_can_reset_value_handle(const std::string& function_name, - InternalHandleT* internal_handle, bool expect_buffer) { - if (internal_handle->get_value_type() != detail::get_data_type<fpType>()) { - throw oneapi::mkl::invalid_argument( - "sparse_blas", function_name, - "Incompatible data types expected " + - data_type_to_str(internal_handle->get_value_type()) + " but got " + - data_type_to_str(detail::get_data_type<fpType>())); - } - if (internal_handle->all_use_buffer() != expect_buffer) { - throw oneapi::mkl::invalid_argument( - "sparse_blas", function_name, "Cannot change the container type between buffer or USM"); - } + *p_dvhandle = new dense_vector_handle(val, size); } template <typename fpType> -void set_dense_vector_data(sycl::queue& /*queue*/, - oneapi::mkl::sparse::dense_vector_handle_t dvhandle, std::int64_t size, - sycl::buffer<fpType, 1> val) { - check_can_reset_value_handle<fpType>(__func__, dvhandle, true); +void set_dense_vector_data(sycl::queue& /*queue*/, dense_vector_handle_t dvhandle, + std::int64_t size, sycl::buffer<fpType, 1> val) { + detail::check_can_reset_value_handle<fpType>(__func__, dvhandle, true); dvhandle->size 
= size; dvhandle->set_buffer(val); } template <typename fpType> -void set_dense_vector_data(sycl::queue& /*queue*/, - oneapi::mkl::sparse::dense_vector_handle_t dvhandle, std::int64_t size, - fpType* val) { - check_can_reset_value_handle<fpType>(__func__, dvhandle, false); +void set_dense_vector_data(sycl::queue& /*queue*/, dense_vector_handle_t dvhandle, + std::int64_t size, fpType* val) { + detail::check_can_reset_value_handle<fpType>(__func__, dvhandle, false); dvhandle->size = size; dvhandle->set_usm_ptr(val); } -#define INSTANTIATE_DENSE_VECTOR_FUNCS(FP_TYPE, FP_SUFFIX) \ - template void init_dense_vector<FP_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \ - std::int64_t size, sycl::buffer<FP_TYPE, 1> val); \ - template void init_dense_vector<FP_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \ - std::int64_t size, FP_TYPE* val); \ - template void set_dense_vector_data<FP_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, \ - std::int64_t size, sycl::buffer<FP_TYPE, 1> val); \ - template void set_dense_vector_data<FP_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, \ - std::int64_t size, FP_TYPE* val) FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_VECTOR_FUNCS); -#undef INSTANTIATE_DENSE_VECTOR_FUNCS -sycl::event release_dense_vector(sycl::queue& queue, - oneapi::mkl::sparse::dense_vector_handle_t dvhandle, +sycl::event release_dense_vector(sycl::queue& queue, dense_vector_handle_t dvhandle, const std::vector<sycl::event>& dependencies) { return detail::submit_release(queue, dvhandle, dependencies); } // Dense matrix template <typename fpType> -void init_dense_matrix(sycl::queue& /*queue*/, - oneapi::mkl::sparse::dense_matrix_handle_t* p_dmhandle, +void init_dense_matrix(sycl::queue& /*queue*/, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout 
dense_layout, sycl::buffer<fpType, 1> val) { - *p_dmhandle = - new oneapi::mkl::sparse::dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout); + *p_dmhandle = new dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout); } template <typename fpType> -void init_dense_matrix(sycl::queue& /*queue*/, - oneapi::mkl::sparse::dense_matrix_handle_t* p_dmhandle, +void init_dense_matrix(sycl::queue& /*queue*/, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, fpType* val) { - *p_dmhandle = - new oneapi::mkl::sparse::dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout); + *p_dmhandle = new dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout); } template <typename fpType> -void set_dense_matrix_data(sycl::queue& /*queue*/, - oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, +void set_dense_matrix_data(sycl::queue& /*queue*/, dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, sycl::buffer<fpType, 1> val) { - check_can_reset_value_handle<fpType>(__func__, dmhandle, true); + detail::check_can_reset_value_handle<fpType>(__func__, dmhandle, true); dmhandle->num_rows = num_rows; dmhandle->num_cols = num_cols; dmhandle->ld = ld; @@ -121,11 +83,10 @@ void set_dense_matrix_data(sycl::queue& /*queue*/, } template <typename fpType> -void set_dense_matrix_data(sycl::queue& /*queue*/, - oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, +void set_dense_matrix_data(sycl::queue& /*queue*/, dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, fpType* val) { - check_can_reset_value_handle<fpType>(__func__, dmhandle, false); + detail::check_can_reset_value_handle<fpType>(__func__, dmhandle, false); dmhandle->num_rows = num_rows; dmhandle->num_cols = num_cols; dmhandle->ld = ld; @@ -133,28 +94,9 @@ void 
set_dense_matrix_data(sycl::queue& /*queue*/, dmhandle->set_usm_ptr(val); } -#define INSTANTIATE_DENSE_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX) \ - template void init_dense_matrix<FP_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - oneapi::mkl::layout dense_layout, sycl::buffer<FP_TYPE, 1> val); \ - template void init_dense_matrix<FP_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - oneapi::mkl::layout dense_layout, FP_TYPE* val); \ - template void set_dense_matrix_data<FP_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - oneapi::mkl::layout dense_layout, sycl::buffer<FP_TYPE, 1> val); \ - template void set_dense_matrix_data<FP_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - oneapi::mkl::layout dense_layout, FP_TYPE* val) FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_MATRIX_FUNCS); -#undef INSTANTIATE_DENSE_MATRIX_FUNCS -sycl::event release_dense_matrix(sycl::queue& queue, - oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, +sycl::event release_dense_matrix(sycl::queue& queue, dense_matrix_handle_t dmhandle, const std::vector<sycl::event>& dependencies) { return detail::submit_release(queue, dmhandle, dependencies); } @@ -167,7 +109,9 @@ void init_coo_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p sycl::buffer<intType, 1> col_ind, sycl::buffer<fpType, 1> val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val); + auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, 
row_ind, col_ind, val, + detail::sparse_format::COO, num_rows, + num_cols, nnz, index); // The backend handle must use the buffers from the internal handle as they will be kept alive until the handle is released. oneapi::mkl::sparse::set_coo_data(queue, mkl_handle, static_cast<intType>(num_rows), static_cast<intType>(num_cols), static_cast<intType>(nnz), @@ -184,7 +128,9 @@ void init_coo_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p fpType* val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val); + auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val, + detail::sparse_format::COO, num_rows, + num_cols, nnz, index); auto event = oneapi::mkl::sparse::set_coo_data( queue, mkl_handle, static_cast<intType>(num_rows), static_cast<intType>(num_cols), static_cast<intType>(nnz), index, row_ind, col_ind, val); @@ -192,32 +138,17 @@ void init_coo_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p *p_smhandle = reinterpret_cast<oneapi::mkl::sparse::matrix_handle_t>(internal_smhandle); } -template <typename fpType, typename intType> -void check_can_reset_sparse_handle(const std::string& function_name, - detail::sparse_matrix_handle* internal_smhandle, - bool expect_buffer) { - check_can_reset_value_handle<fpType>(function_name, internal_smhandle, expect_buffer); - if (internal_smhandle->get_int_type() != detail::get_data_type<intType>()) { - throw oneapi::mkl::invalid_argument( - "sparse_blas", function_name, - "Incompatible data types expected " + - data_type_to_str(internal_smhandle->get_int_type()) + " but got " + - data_type_to_str(detail::get_data_type<intType>())); - } - if (!internal_smhandle->can_be_reset) { - throw mkl::unimplemented( - "sparse_blas/mkl", function_name, - "Reseting the matrix handle's data after it was used in a computation is not 
supported."); - } -} - template <typename fpType, typename intType> void set_coo_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer<intType, 1> row_ind, sycl::buffer<intType, 1> col_ind, sycl::buffer<fpType, 1> val) { auto internal_smhandle = detail::get_internal_handle(smhandle); - check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, true); + detail::check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, true); + internal_smhandle->num_rows = num_rows; + internal_smhandle->num_cols = num_cols; + internal_smhandle->nnz = nnz; + internal_smhandle->index = index; internal_smhandle->row_container.set_buffer(row_ind); internal_smhandle->col_container.set_buffer(col_ind); internal_smhandle->value_container.set_buffer(val); @@ -236,7 +167,11 @@ void set_coo_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_ oneapi::mkl::index_base index, intType* row_ind, intType* col_ind, fpType* val) { auto internal_smhandle = detail::get_internal_handle(smhandle); - check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, false); + detail::check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, false); + internal_smhandle->num_rows = num_rows; + internal_smhandle->num_cols = num_cols; + internal_smhandle->nnz = nnz; + internal_smhandle->index = index; internal_smhandle->row_container.set_usm_ptr(row_ind); internal_smhandle->col_container.set_usm_ptr(col_ind); internal_smhandle->value_container.set_usm_ptr(val); @@ -246,37 +181,19 @@ void set_coo_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_ event.wait_and_throw(); } -#define INSTANTIATE_COO_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ - template void init_coo_matrix<FP_TYPE, INT_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ - 
std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ - oneapi::mkl::index_base index, sycl::buffer<INT_TYPE, 1> row_ind, \ - sycl::buffer<INT_TYPE, 1> col_ind, sycl::buffer<FP_TYPE, 1> val); \ - template void init_coo_matrix<FP_TYPE, INT_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ - oneapi::mkl::index_base index, INT_TYPE* row_ind, INT_TYPE* col_ind, FP_TYPE* val); \ - template void set_coo_matrix_data<FP_TYPE, INT_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ - std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ - sycl::buffer<INT_TYPE, 1> row_ind, sycl::buffer<INT_TYPE, 1> col_ind, \ - sycl::buffer<FP_TYPE, 1> val); \ - template void set_coo_matrix_data<FP_TYPE, INT_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ - std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, INT_TYPE* row_ind, \ - INT_TYPE* col_ind, FP_TYPE* val) FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_COO_MATRIX_FUNCS); -#undef INSTANTIATE_COO_MATRIX_FUNCS // CSR matrix template <typename fpType, typename intType> void init_csr_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p_smhandle, - std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer<intType, 1> row_ptr, sycl::buffer<intType, 1> col_ind, sycl::buffer<fpType, 1> val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val); + auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val, + detail::sparse_format::CSR, num_rows, + num_cols, nnz, index); // The backend 
deduces nnz from row_ptr. // The backend handle must use the buffers from the internal handle as they will be kept alive until the handle is released. oneapi::mkl::sparse::set_csr_data(queue, mkl_handle, static_cast<intType>(num_rows), @@ -289,12 +206,14 @@ void init_csr_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p template <typename fpType, typename intType> void init_csr_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p_smhandle, - std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, intType* row_ptr, intType* col_ind, fpType* val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val); + auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val, + detail::sparse_format::CSR, num_rows, + num_cols, nnz, index); // The backend deduces nnz from row_ptr. 
auto event = oneapi::mkl::sparse::set_csr_data( queue, mkl_handle, static_cast<intType>(num_rows), static_cast<intType>(num_cols), index, @@ -305,11 +224,15 @@ void init_csr_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p template <typename fpType, typename intType> void set_csr_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle, - std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer<intType, 1> row_ptr, sycl::buffer<intType, 1> col_ind, sycl::buffer<fpType, 1> val) { auto internal_smhandle = detail::get_internal_handle(smhandle); - check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, true); + detail::check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, true); + internal_smhandle->num_rows = num_rows; + internal_smhandle->num_cols = num_cols; + internal_smhandle->nnz = nnz; + internal_smhandle->index = index; internal_smhandle->row_container.set_buffer(row_ptr); internal_smhandle->col_container.set_buffer(col_ind); internal_smhandle->value_container.set_buffer(val); @@ -325,11 +248,15 @@ void set_csr_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_ template <typename fpType, typename intType> void set_csr_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle, - std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, intType* row_ptr, intType* col_ind, fpType* val) { auto internal_smhandle = detail::get_internal_handle(smhandle); - check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, false); + detail::check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, false); + internal_smhandle->num_rows = num_rows; + internal_smhandle->num_cols = num_cols; + 
internal_smhandle->nnz = nnz; + internal_smhandle->index = index; internal_smhandle->row_container.set_usm_ptr(row_ptr); internal_smhandle->col_container.set_usm_ptr(col_ind); internal_smhandle->value_container.set_usm_ptr(val); @@ -340,27 +267,7 @@ void set_csr_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_ event.wait_and_throw(); } -#define INSTANTIATE_CSR_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ - template void init_csr_matrix<FP_TYPE, INT_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ - oneapi::mkl::index_base index, sycl::buffer<INT_TYPE, 1> row_ptr, \ - sycl::buffer<INT_TYPE, 1> col_ind, sycl::buffer<FP_TYPE, 1> val); \ - template void init_csr_matrix<FP_TYPE, INT_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ - oneapi::mkl::index_base index, INT_TYPE* row_ptr, INT_TYPE* col_ind, FP_TYPE* val); \ - template void set_csr_matrix_data<FP_TYPE, INT_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ - std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ - sycl::buffer<INT_TYPE, 1> row_ptr, sycl::buffer<INT_TYPE, 1> col_ind, \ - sycl::buffer<FP_TYPE, 1> val); \ - template void set_csr_matrix_data<FP_TYPE, INT_TYPE>( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ - std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, INT_TYPE* row_ptr, \ - INT_TYPE* col_ind, FP_TYPE* val) FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_CSR_MATRIX_FUNCS); -#undef INSTANTIATE_CSR_MATRIX_FUNCS // Common sparse matrix functions sycl::event release_sparse_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle, @@ -369,22 +276,22 @@ sycl::event release_sparse_matrix(sycl::queue& queue, 
oneapi::mkl::sparse::matri // Asynchronously release the backend's handle followed by the internal handle. auto event = oneapi::mkl::sparse::release_matrix_handle( queue, &internal_smhandle->backend_handle, dependencies); - return detail::submit_release(queue, internal_smhandle, event); + return detail::submit_release(queue, internal_smhandle, { event }); } bool set_matrix_property(sycl::queue& /*queue*/, oneapi::mkl::sparse::matrix_handle_t smhandle, - oneapi::mkl::sparse::matrix_property property) { + matrix_property property) { auto internal_smhandle = detail::get_internal_handle(smhandle); // Store the matrix property internally for better error checking internal_smhandle->set_matrix_property(property); // Set the matrix property on the backend handle // Backend and oneMKL interface types for the property don't match switch (property) { - case oneapi::mkl::sparse::matrix_property::symmetric: + case matrix_property::symmetric: oneapi::mkl::sparse::set_matrix_property(internal_smhandle->backend_handle, oneapi::mkl::sparse::property::symmetric); return true; - case oneapi::mkl::sparse::matrix_property::sorted: + case matrix_property::sorted: oneapi::mkl::sparse::set_matrix_property(internal_smhandle->backend_handle, oneapi::mkl::sparse::property::sorted); return true; diff --git a/src/sparse_blas/backends/mkl_common/mkl_handles.hpp b/src/sparse_blas/backends/mkl_common/mkl_handles.hpp index 44b12e8df..1bce0b8fb 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_handles.hpp +++ b/src/sparse_blas/backends/mkl_common/mkl_handles.hpp @@ -26,6 +26,8 @@ #include <oneapi/mkl/spblas.hpp> #include "sparse_blas/generic_container.hpp" +#include "sparse_blas/macros.hpp" +#include "sparse_blas/sycl_helper.hpp" namespace oneapi::mkl::sparse { diff --git a/src/sparse_blas/backends/mkl_common/mkl_helper.hpp b/src/sparse_blas/backends/mkl_common/mkl_helper.hpp deleted file mode 100644 index 99dc6707d..000000000 --- a/src/sparse_blas/backends/mkl_common/mkl_helper.hpp +++ 
/dev/null @@ -1,111 +0,0 @@ -/*************************************************************************** -* Copyright (C) Codeplay Software Limited -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* For your convenience, a copy of the License has been included in this -* repository. -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -**************************************************************************/ - -#ifndef _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_HELPER_HPP_ -#define _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_HELPER_HPP_ - -#if __has_include(<sycl/sycl.hpp>) -#include <sycl/sycl.hpp> -#else -#include <CL/sycl.hpp> -#endif - -#include "oneapi/mkl/exceptions.hpp" -#include "oneapi/mkl/sparse_blas/detail/helper_types.hpp" - -#include "sparse_blas/enum_data_types.hpp" -#include "sparse_blas/macros.hpp" - -namespace oneapi::mkl::sparse::detail { - -/// Return whether a pointer is accessible on the host -template <typename T> -inline bool is_ptr_accessible_on_host(sycl::queue& queue, const T* host_or_device_ptr) { - auto alloc_type = sycl::get_pointer_type(host_or_device_ptr, queue.get_context()); - return alloc_type == sycl::usm::alloc::host || alloc_type == sycl::usm::alloc::shared || - alloc_type == sycl::usm::alloc::unknown; -} - -/// Throw an exception if the scalar is not accessible in the host -inline void check_ptr_is_host_accessible(const std::string& function_name, - const std::string& scalar_name, - bool is_ptr_accessible_on_host) { - if (!is_ptr_accessible_on_host) { - throw 
mkl::invalid_argument( - "sparse_blas", function_name, - "Scalar " + scalar_name + " must be accessible on the host for buffer functions."); - } -} - -/// Return a scalar on the host from a pointer to host or device memory -/// Used for USM functions -template <typename T> -inline T get_scalar_on_host(sycl::queue& queue, const T* host_or_device_ptr, - bool is_ptr_accessible_on_host) { - if (is_ptr_accessible_on_host) { - return *host_or_device_ptr; - } - T scalar; - auto event = queue.copy(host_or_device_ptr, &scalar, 1); - event.wait_and_throw(); - return scalar; -} - -/// Merge multiple event dependencies into one -inline sycl::event collapse_dependencies(sycl::queue& queue, - const std::vector<sycl::event>& dependencies) { - if (dependencies.empty()) { - return {}; - } - else if (dependencies.size() == 1) { - return dependencies[0]; - } - - return queue.submit([&](sycl::handler& cgh) { - cgh.depends_on(dependencies); - cgh.host_task([=]() {}); - }); -} - -/// Convert \p value_type to template type argument and use it to call \p op_functor. -#define DISPATCH_MKL_OPERATION(function_name, value_type, op_functor, ...) 
\ - switch (value_type) { \ - case detail::data_type::real_fp32: return op_functor<float>(__VA_ARGS__); \ - case detail::data_type::real_fp64: return op_functor<double>(__VA_ARGS__); \ - case detail::data_type::complex_fp32: return op_functor<std::complex<float>>(__VA_ARGS__); \ - case detail::data_type::complex_fp64: \ - return op_functor<std::complex<double>>(__VA_ARGS__); \ - default: \ - throw oneapi::mkl::exception( \ - "sparse_blas", function_name, \ - "Internal error: unsupported type " + data_type_to_str(value_type)); \ - } - -#define CHECK_DESCR_MATCH(descr, argument, optimize_func_name) \ - do { \ - if (descr->last_optimized_##argument != argument) { \ - throw mkl::invalid_argument( \ - "sparse_blas", __func__, \ - #argument " argument must match with the previous call to " #optimize_func_name); \ - } \ - } while (0) - -} // namespace oneapi::mkl::sparse::detail - -#endif // _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_HELPER_HPP_ diff --git a/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx b/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx index 49987a202..9c0bc577b 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx @@ -17,6 +17,8 @@ * **************************************************************************/ +// In this file functions and types using the namespace oneapi::mkl::sparse:: refer to the backend's namespace for better readability. 
+ namespace oneapi::mkl::sparse { struct spmm_descr { @@ -24,68 +26,40 @@ struct spmm_descr { bool optimized_called = false; oneapi::mkl::transpose last_optimized_opA; oneapi::mkl::transpose last_optimized_opB; - oneapi::mkl::sparse::matrix_view last_optimized_A_view; - oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle; - oneapi::mkl::sparse::dense_matrix_handle_t last_optimized_B_handle; - oneapi::mkl::sparse::dense_matrix_handle_t last_optimized_C_handle; - oneapi::mkl::sparse::spmm_alg last_optimized_alg; + matrix_view last_optimized_A_view; + matrix_handle_t last_optimized_A_handle; + dense_matrix_handle_t last_optimized_B_handle; + dense_matrix_handle_t last_optimized_C_handle; + spmm_alg last_optimized_alg; }; } // namespace oneapi::mkl::sparse namespace oneapi::mkl::sparse::BACKEND { -void init_spmm_descr(sycl::queue& /*queue*/, oneapi::mkl::sparse::spmm_descr_t* p_spmm_descr) { +void init_spmm_descr(sycl::queue& /*queue*/, spmm_descr_t* p_spmm_descr) { *p_spmm_descr = new spmm_descr(); } -sycl::event release_spmm_descr(sycl::queue& queue, oneapi::mkl::sparse::spmm_descr_t spmm_descr, +sycl::event release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr, const std::vector<sycl::event>& dependencies) { return detail::submit_release(queue, spmm_descr, dependencies); } void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose opA, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - bool is_alpha_host_accessible, bool is_beta_host_accessible) { - THROW_IF_NULLPTR(function_name, A_handle); - THROW_IF_NULLPTR(function_name, B_handle); - THROW_IF_NULLPTR(function_name, C_handle); - + matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, + dense_matrix_handle_t C_handle, bool is_alpha_host_accessible, + bool is_beta_host_accessible) { auto 
internal_A_handle = detail::get_internal_handle(A_handle); - detail::check_all_containers_compatible(function_name, internal_A_handle, B_handle, C_handle); - if (internal_A_handle->all_use_buffer()) { - detail::check_ptr_is_host_accessible("spmm", "alpha", is_alpha_host_accessible); - detail::check_ptr_is_host_accessible("spmm", "beta", is_beta_host_accessible); - } - if (is_alpha_host_accessible != is_beta_host_accessible) { - throw mkl::invalid_argument( - "sparse_blas", function_name, - "Alpha and beta must both be placed on host memory or device memory."); - } - if (B_handle->dense_layout != C_handle->dense_layout) { - throw mkl::invalid_argument("sparse_blas", function_name, - "B and C matrices must used the same layout."); - } - - if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::general) { - throw mkl::invalid_argument("sparse_blas", function_name, - "Matrix view's type must be `matrix_descr::general`."); - } - - if (A_view.diag_view != oneapi::mkl::diag::nonunit) { - throw mkl::invalid_argument("sparse_blas", function_name, - "Matrix's diag_view must be `nonunit`."); - } + detail::check_valid_spmm_common(function_name, A_view, internal_A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible); #if BACKEND == gpu detail::data_type data_type = internal_A_handle->get_value_type(); if ((data_type == detail::data_type::complex_fp32 || data_type == detail::data_type::complex_fp64) && opA == oneapi::mkl::transpose::conjtrans && - internal_A_handle->has_matrix_property(oneapi::mkl::sparse::matrix_property::symmetric)) { + internal_A_handle->has_matrix_property(matrix_property::symmetric)) { throw mkl::unimplemented( "sparse_blas", function_name, "The backend does not support spmm using conjtrans and the symmetric property."); @@ -96,13 +70,10 @@ void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose o } void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose /*opB*/, 
const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - oneapi::mkl::sparse::spmm_alg /*alg*/, - oneapi::mkl::sparse::spmm_descr_t spmm_descr, std::size_t& temp_buffer_size) { + oneapi::mkl::transpose /*opB*/, const void* alpha, matrix_view A_view, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, const void* beta, + dense_matrix_handle_t C_handle, spmm_alg /*alg*/, spmm_descr_t spmm_descr, + std::size_t& temp_buffer_size) { // TODO: Add support for external workspace once the close-source oneMKL backend supports it. bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); @@ -112,12 +83,11 @@ void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, spmm_descr->buffer_size_called = true; } -inline void common_spmm_optimize( - sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, oneapi::mkl::sparse::spmm_alg alg, - oneapi::mkl::sparse::spmm_descr_t spmm_descr) { +inline void common_spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, + const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, + spmm_descr_t spmm_descr) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmm("spmm_optimize", opA, A_view, A_handle, B_handle, C_handle, @@ -137,11 
+107,9 @@ inline void common_spmm_optimize( } void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void* alpha, oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, + spmm_alg alg, spmm_descr_t spmm_descr, sycl::buffer<std::uint8_t, 1> /*workspace*/) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (!internal_A_handle->all_use_buffer()) { @@ -149,7 +117,7 @@ void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl:: } common_spmm_optimize(queue, opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg, spmm_descr); - if (alg == oneapi::mkl::sparse::spmm_alg::no_optimize_alg) { + if (alg == spmm_alg::no_optimize_alg) { return; } internal_A_handle->can_be_reset = false; @@ -157,13 +125,10 @@ void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl:: } sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - oneapi::mkl::sparse::spmm_alg alg, - oneapi::mkl::sparse::spmm_descr_t spmm_descr, void* /*workspace*/, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, + const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, + spmm_descr_t spmm_descr, void* /*workspace*/, const 
std::vector<sycl::event>& dependencies) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (internal_A_handle->all_use_buffer()) { @@ -171,7 +136,7 @@ sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, } common_spmm_optimize(queue, opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg, spmm_descr); - if (alg == oneapi::mkl::sparse::spmm_alg::no_optimize_alg) { + if (alg == spmm_alg::no_optimize_alg) { return detail::collapse_dependencies(queue, dependencies); } internal_A_handle->can_be_reset = false; @@ -180,13 +145,12 @@ sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, } template <typename T> -sycl::event internal_spmm( - sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, const void* alpha, - oneapi::mkl::sparse::matrix_view /*A_view*/, oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, oneapi::mkl::sparse::spmm_alg /*alg*/, - oneapi::mkl::sparse::spmm_descr_t /*spmm_descr*/, const std::vector<sycl::event>& dependencies, - bool is_alpha_host_accessible, bool is_beta_host_accessible) { +sycl::event internal_spmm(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, matrix_view /*A_view*/, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, + const void* beta, dense_matrix_handle_t C_handle, spmm_alg /*alg*/, + spmm_descr_t /*spmm_descr*/, const std::vector<sycl::event>& dependencies, + bool is_alpha_host_accessible, bool is_beta_host_accessible) { T host_alpha = detail::get_scalar_on_host(queue, static_cast<const T*>(alpha), is_alpha_host_accessible); T host_beta = @@ -213,11 +177,9 @@ sycl::event internal_spmm( } sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void* alpha, oneapi::mkl::sparse::matrix_view A_view, - 
oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, + spmm_alg alg, spmm_descr_t spmm_descr, const std::vector<sycl::event>& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); diff --git a/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx b/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx index d5a24e9f4..9fc43d8e9 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx @@ -17,67 +17,44 @@ * **************************************************************************/ +// In this file functions and types using the namespace oneapi::mkl::sparse:: refer to the backend's namespace for better readability. 
+ namespace oneapi::mkl::sparse { struct spmv_descr { bool buffer_size_called = false; bool optimized_called = false; oneapi::mkl::transpose last_optimized_opA; - oneapi::mkl::sparse::matrix_view last_optimized_A_view; - oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle; - oneapi::mkl::sparse::dense_vector_handle_t last_optimized_x_handle; - oneapi::mkl::sparse::dense_vector_handle_t last_optimized_y_handle; - oneapi::mkl::sparse::spmv_alg last_optimized_alg; + matrix_view last_optimized_A_view; + matrix_handle_t last_optimized_A_handle; + dense_vector_handle_t last_optimized_x_handle; + dense_vector_handle_t last_optimized_y_handle; + spmv_alg last_optimized_alg; }; } // namespace oneapi::mkl::sparse namespace oneapi::mkl::sparse::BACKEND { -void init_spmv_descr(sycl::queue& /*queue*/, oneapi::mkl::sparse::spmv_descr_t* p_spmv_descr) { +void init_spmv_descr(sycl::queue& /*queue*/, spmv_descr_t* p_spmv_descr) { *p_spmv_descr = new spmv_descr(); } -sycl::event release_spmv_descr(sycl::queue& queue, oneapi::mkl::sparse::spmv_descr_t spmv_descr, +sycl::event release_spmv_descr(sycl::queue& queue, spmv_descr_t spmv_descr, const std::vector<sycl::event>& dependencies) { return detail::submit_release(queue, spmv_descr, dependencies); } void check_valid_spmv(const std::string& function_name, oneapi::mkl::transpose opA, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - bool is_alpha_host_accessible, bool is_beta_host_accessible) { - THROW_IF_NULLPTR(function_name, A_handle); - THROW_IF_NULLPTR(function_name, x_handle); - THROW_IF_NULLPTR(function_name, y_handle); - + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, bool is_alpha_host_accessible, + bool is_beta_host_accessible) { auto internal_A_handle = detail::get_internal_handle(A_handle); - 
detail::check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle); - if (internal_A_handle->all_use_buffer()) { - detail::check_ptr_is_host_accessible("spmv", "alpha", is_alpha_host_accessible); - detail::check_ptr_is_host_accessible("spmv", "beta", is_beta_host_accessible); - } - if (is_alpha_host_accessible != is_beta_host_accessible) { - throw mkl::invalid_argument( - "sparse_blas", function_name, - "Alpha and beta must both be placed on host memory or device memory."); - } - if (A_view.type_view == oneapi::mkl::sparse::matrix_descr::diagonal) { - throw mkl::invalid_argument("sparse_blas", function_name, - "Matrix view's type cannot be diagonal."); - } - - if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::triangular && - A_view.diag_view == oneapi::mkl::diag::unit) { - throw mkl::invalid_argument( - "sparse_blas", function_name, - "`unit` diag_view can only be used with a triangular type_view."); - } + detail::check_valid_spmv_common(function_name, opA, A_view, internal_A_handle, x_handle, + y_handle, is_alpha_host_accessible, is_beta_host_accessible); - if ((A_view.type_view == oneapi::mkl::sparse::matrix_descr::symmetric || - A_view.type_view == oneapi::mkl::sparse::matrix_descr::hermitian) && + if ((A_view.type_view == matrix_descr::symmetric || + A_view.type_view == matrix_descr::hermitian) && opA == oneapi::mkl::transpose::conjtrans) { throw mkl::unimplemented( "sparse_blas", function_name, @@ -86,12 +63,9 @@ void check_valid_spmv(const std::string& function_name, oneapi::mkl::transpose o } void spmv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void* beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg /*alg*/, - oneapi::mkl::sparse::spmv_descr_t spmv_descr, std::size_t& temp_buffer_size) { + 
matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + const void* beta, dense_vector_handle_t y_handle, spmv_alg /*alg*/, + spmv_descr_t spmv_descr, std::size_t& temp_buffer_size) { // TODO: Add support for external workspace once the close-source oneMKL backend supports it. bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); @@ -102,13 +76,10 @@ void spmv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void } inline void common_spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - const void* beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, - oneapi::mkl::sparse::spmv_descr_t spmv_descr) { + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, const void* beta, + dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmv("spmv_optimize", opA, A_view, A_handle, x_handle, y_handle, @@ -127,19 +98,16 @@ inline void common_spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, } void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void* beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr, - sycl::buffer<std::uint8_t, 1> /*workspace*/) { + matrix_view A_view, matrix_handle_t A_handle, 
dense_vector_handle_t x_handle, + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, sycl::buffer<std::uint8_t, 1> /*workspace*/) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (!internal_A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); } common_spmv_optimize(queue, opA, alpha, A_view, A_handle, x_handle, beta, y_handle, alg, spmv_descr); - if (alg == oneapi::mkl::sparse::spmv_alg::no_optimize_alg) { + if (alg == spmv_alg::no_optimize_alg) { return; } internal_A_handle->can_be_reset = false; @@ -158,20 +126,17 @@ void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* a } sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void* beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, - oneapi::mkl::sparse::spmv_descr_t spmv_descr, void* /*workspace*/, - const std::vector<sycl::event>& dependencies) { + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, const void* beta, + dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, + void* /*workspace*/, const std::vector<sycl::event>& dependencies) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (internal_A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); } common_spmv_optimize(queue, opA, alpha, A_view, A_handle, x_handle, beta, y_handle, alg, spmv_descr); - if (alg == oneapi::mkl::sparse::spmv_alg::no_optimize_alg) { + if (alg == spmv_alg::no_optimize_alg) { return detail::collapse_dependencies(queue, dependencies); } internal_A_handle->can_be_reset = false; @@ -191,13 +156,10 @@ sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const template <typename T> 
sycl::event internal_spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void* beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg /*alg*/, - oneapi::mkl::sparse::spmv_descr_t /*spmv_descr*/, - const std::vector<sycl::event>& dependencies, + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, const void* beta, + dense_vector_handle_t y_handle, spmv_alg /*alg*/, + spmv_descr_t /*spmv_descr*/, const std::vector<sycl::event>& dependencies, bool is_alpha_host_accessible, bool is_beta_host_accessible) { T host_alpha = detail::get_scalar_on_host(queue, static_cast<const T*>(alpha), is_alpha_host_accessible); @@ -246,12 +208,9 @@ sycl::event internal_spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const } sycl::event spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void* beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr, - const std::vector<sycl::event>& dependencies) { + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, const std::vector<sycl::event>& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, diff --git a/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx b/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx index 
f73000340..dd2a4f627 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx @@ -17,45 +17,43 @@ * **************************************************************************/ +// In this file functions and types using the namespace oneapi::mkl::sparse:: refer to the backend's namespace for better readability. + namespace oneapi::mkl::sparse { struct spsv_descr { bool buffer_size_called = false; bool optimized_called = false; oneapi::mkl::transpose last_optimized_opA; - oneapi::mkl::sparse::matrix_view last_optimized_A_view; - oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle; - oneapi::mkl::sparse::dense_vector_handle_t last_optimized_x_handle; - oneapi::mkl::sparse::dense_vector_handle_t last_optimized_y_handle; - oneapi::mkl::sparse::spsv_alg last_optimized_alg; + matrix_view last_optimized_A_view; + matrix_handle_t last_optimized_A_handle; + dense_vector_handle_t last_optimized_x_handle; + dense_vector_handle_t last_optimized_y_handle; + spsv_alg last_optimized_alg; }; } // namespace oneapi::mkl::sparse namespace oneapi::mkl::sparse::BACKEND { -void init_spsv_descr(sycl::queue& /*queue*/, oneapi::mkl::sparse::spsv_descr_t* p_spsv_descr) { +void init_spsv_descr(sycl::queue& /*queue*/, spsv_descr_t* p_spsv_descr) { *p_spsv_descr = new spsv_descr(); } -sycl::event release_spsv_descr(sycl::queue& queue, oneapi::mkl::sparse::spsv_descr_t spsv_descr, +sycl::event release_spsv_descr(sycl::queue& queue, spsv_descr_t spsv_descr, const std::vector<sycl::event>& dependencies) { return detail::submit_release(queue, spsv_descr, dependencies); } void check_valid_spsv(const std::string& function_name, oneapi::mkl::transpose opA, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - bool is_alpha_host_accessible, oneapi::mkl::sparse::spsv_alg alg) { - 
THROW_IF_NULLPTR(function_name, A_handle); - THROW_IF_NULLPTR(function_name, x_handle); - THROW_IF_NULLPTR(function_name, y_handle); - + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, bool is_alpha_host_accessible, spsv_alg alg) { auto internal_A_handle = detail::get_internal_handle(A_handle); - if (alg == oneapi::mkl::sparse::spsv_alg::no_optimize_alg && - !internal_A_handle->has_matrix_property(oneapi::mkl::sparse::matrix_property::sorted)) { + detail::check_valid_spsv_common(function_name, A_view, internal_A_handle, x_handle, y_handle, + is_alpha_host_accessible); + + if (alg == spsv_alg::no_optimize_alg && + !internal_A_handle->has_matrix_property(matrix_property::sorted)) { throw mkl::unimplemented( "sparse_blas", function_name, "The backend does not support `no_optimize_alg` unless A_handle has the property `matrix_property::sorted`."); @@ -72,25 +70,12 @@ void check_valid_spsv(const std::string& function_name, oneapi::mkl::transpose o #else (void)opA; #endif // BACKEND - - detail::check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle); - if (A_view.type_view != matrix_descr::triangular) { - throw mkl::invalid_argument("sparse_blas", function_name, - "Matrix view's type must be `matrix_descr::triangular`."); - } - - if (internal_A_handle->all_use_buffer()) { - detail::check_ptr_is_host_accessible("spsv", "alpha", is_alpha_host_accessible); - } } void spsv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, - oneapi::mkl::sparse::spsv_descr_t spsv_descr, std::size_t& temp_buffer_size) { + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, 
spsv_alg alg, spsv_descr_t spsv_descr, + std::size_t& temp_buffer_size) { // TODO: Add support for external workspace once the close-source oneMKL backend supports it. bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); check_valid_spsv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, @@ -100,12 +85,9 @@ void spsv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void } inline void common_spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, - oneapi::mkl::sparse::spsv_descr_t spsv_descr) { + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, + spsv_alg alg, spsv_descr_t spsv_descr) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); check_valid_spsv("spsv_optimize", opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, alg); @@ -123,18 +105,15 @@ inline void common_spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, } void spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, sycl::buffer<std::uint8_t, 1> /*workspace*/) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (!internal_A_handle->all_use_buffer()) { 
detail::throw_incompatible_container(__func__); } common_spsv_optimize(queue, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr); - if (alg == oneapi::mkl::sparse::spsv_alg::no_optimize_alg) { + if (alg == spsv_alg::no_optimize_alg) { return; } internal_A_handle->can_be_reset = false; @@ -143,19 +122,16 @@ void spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* a } sycl::event spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, - oneapi::mkl::sparse::spsv_descr_t spsv_descr, void* /*workspace*/, + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, + spsv_alg alg, spsv_descr_t spsv_descr, void* /*workspace*/, const std::vector<sycl::event>& dependencies) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (internal_A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); } common_spsv_optimize(queue, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr); - if (alg == oneapi::mkl::sparse::spsv_alg::no_optimize_alg) { + if (alg == spsv_alg::no_optimize_alg) { return detail::collapse_dependencies(queue, dependencies); } internal_A_handle->can_be_reset = false; @@ -165,12 +141,9 @@ sycl::event spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const template <typename T> sycl::event internal_spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg /*alg*/, - oneapi::mkl::sparse::spsv_descr_t 
/*spsv_descr*/, + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, + spsv_alg /*alg*/, spsv_descr_t /*spsv_descr*/, const std::vector<sycl::event>& dependencies, bool is_alpha_host_accessible) { T host_alpha = @@ -193,11 +166,8 @@ sycl::event internal_spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const } sycl::event spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, const std::vector<sycl::event>& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); check_valid_spsv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp index a6ea51629..0aaf91b25 100644 --- a/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp +++ b/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp @@ -19,7 +19,7 @@ #include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp" -#include "sparse_blas/backends/mkl_common/mkl_helper.hpp" +#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp" #include "sparse_blas/backends/mkl_common/mkl_handles.hpp" namespace oneapi::mkl::sparse::mklcpu { diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp index 0929a7ef4..ebc8ceecf 100644 --- a/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp +++ b/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp @@ -17,10 +17,12 @@ 
* **************************************************************************/ +#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp" #include "sparse_blas/backends/mkl_common/mkl_handles.hpp" -#include "sparse_blas/backends/mkl_common/mkl_helper.hpp" +#include "sparse_blas/common_op_verification.hpp" #include "sparse_blas/macros.hpp" #include "sparse_blas/matrix_view_comparison.hpp" +#include "sparse_blas/sycl_helper.hpp" #include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp" diff --git a/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp b/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp index 7cb9853a7..648fed66e 100644 --- a/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp +++ b/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp @@ -19,8 +19,8 @@ #include "oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp" +#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp" #include "sparse_blas/backends/mkl_common/mkl_handles.hpp" -#include "sparse_blas/backends/mkl_common/mkl_helper.hpp" namespace oneapi::mkl::sparse::mklgpu { diff --git a/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp b/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp index be5e0c0aa..1102306dc 100644 --- a/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp +++ b/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp @@ -17,10 +17,12 @@ * **************************************************************************/ +#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp" #include "sparse_blas/backends/mkl_common/mkl_handles.hpp" -#include "sparse_blas/backends/mkl_common/mkl_helper.hpp" +#include "sparse_blas/common_op_verification.hpp" #include "sparse_blas/macros.hpp" #include "sparse_blas/matrix_view_comparison.hpp" +#include "sparse_blas/sycl_helper.hpp" #include "oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp" diff --git a/src/sparse_blas/common_op_verification.hpp b/src/sparse_blas/common_op_verification.hpp 
new file mode 100644
index 000000000..318766fb4
--- /dev/null
+++ b/src/sparse_blas/common_op_verification.hpp
@@ -0,0 +1,151 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SRC_SPARSE_BLAS_COMMON_OP_VERIFICATION_HPP_
+#define _ONEMKL_SRC_SPARSE_BLAS_COMMON_OP_VERIFICATION_HPP_
+
+#include <string>
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+#include "oneapi/mkl/sparse_blas/types.hpp"
+#include "macros.hpp"
+
+namespace oneapi::mkl::sparse::detail {
+
+/// Throw mkl::invalid_argument, reported against \p function_name, if the scalar named
+/// \p scalar_name is not accessible on the host.
+inline void check_ptr_is_host_accessible(const std::string& function_name,
+                                         const std::string& scalar_name,
+                                         bool is_ptr_accessible_on_host) {
+    if (!is_ptr_accessible_on_host) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Scalar " + scalar_name + " must be accessible on the host for buffer functions.");
+    }
+}
+
+/// Validate the arguments common to the spmm functions of all backends.
+/// Throws mkl::uninitialized if a handle is null. Throws mkl::invalid_argument if alpha and
+/// beta are not both accessible on the host or both on the device, if a scalar is not host
+/// accessible while A uses buffers, if B and C use different layouts, or if \p A_view is not
+/// a general view with a non-unit diagonal.
+template <typename InternalSparseMatHandleT>
+void check_valid_spmm_common(const std::string& function_name, matrix_view A_view,
+                             InternalSparseMatHandleT internal_A_handle,
+                             dense_matrix_handle_t B_handle, dense_matrix_handle_t C_handle,
+                             bool is_alpha_host_accessible, bool is_beta_host_accessible) {
+    THROW_IF_NULLPTR(function_name, internal_A_handle);
+    THROW_IF_NULLPTR(function_name, B_handle);
+    THROW_IF_NULLPTR(function_name, C_handle);
+
+    check_all_containers_compatible(function_name, internal_A_handle, B_handle, C_handle);
+    if (internal_A_handle->all_use_buffer()) {
+        check_ptr_is_host_accessible(function_name, "alpha", is_alpha_host_accessible);
+        check_ptr_is_host_accessible(function_name, "beta", is_beta_host_accessible);
+    }
+    if (is_alpha_host_accessible != is_beta_host_accessible) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Alpha and beta must both be placed on host memory or device memory.");
+    }
+    if (B_handle->dense_layout != C_handle->dense_layout) {
+        throw mkl::invalid_argument("sparse_blas", function_name,
+                                    "B and C matrices must use the same layout.");
+    }
+
+    if (A_view.type_view != matrix_descr::general) {
+        throw mkl::invalid_argument("sparse_blas", function_name,
+                                    "Matrix view's `type_view` must be `matrix_descr::general`.");
+    }
+
+    if (A_view.diag_view != oneapi::mkl::diag::nonunit) {
+        throw mkl::invalid_argument("sparse_blas", function_name,
+                                    "Matrix's diag_view must be `nonunit`.");
+    }
+}
+
+/// Validate the arguments common to the spmv functions of all backends. opA is unused.
+/// Throws mkl::uninitialized if a handle is null. Throws mkl::invalid_argument if alpha and
+/// beta are not both accessible on the host or both on the device, if a scalar is not host
+/// accessible while A uses buffers, if \p A_view is diagonal, or if a unit diagonal is
+/// requested for a non-triangular view.
+template <typename InternalSparseMatHandleT>
+void check_valid_spmv_common(const std::string& function_name, oneapi::mkl::transpose /*opA*/,
+                             matrix_view A_view, InternalSparseMatHandleT internal_A_handle,
+                             dense_vector_handle_t x_handle, dense_vector_handle_t y_handle,
+                             bool is_alpha_host_accessible, bool is_beta_host_accessible) {
+    THROW_IF_NULLPTR(function_name, internal_A_handle);
+    THROW_IF_NULLPTR(function_name, x_handle);
+    THROW_IF_NULLPTR(function_name, y_handle);
+
+    check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle);
+    if (internal_A_handle->all_use_buffer()) {
+        check_ptr_is_host_accessible(function_name, "alpha", is_alpha_host_accessible);
+        check_ptr_is_host_accessible(function_name, "beta", is_beta_host_accessible);
+    }
+    if (is_alpha_host_accessible != is_beta_host_accessible) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Alpha and beta must both be placed on host memory or device memory.");
+    }
+    if (A_view.type_view == matrix_descr::diagonal) {
+        throw mkl::invalid_argument("sparse_blas", function_name,
+                                    "Matrix view's `type_view` cannot be diagonal.");
+    }
+
+    if (A_view.type_view != matrix_descr::triangular &&
+        A_view.diag_view == oneapi::mkl::diag::unit) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "`diag_view::unit` can only be used with `type_view::triangular`.");
+    }
+}
+
+/// Validate the arguments common to the spsv functions of all backends.
+/// Throws mkl::uninitialized if a handle is null. Throws mkl::invalid_argument if \p A_view
+/// is not triangular, or if alpha is not host accessible while A uses buffers.
+template <typename InternalSparseMatHandleT>
+void check_valid_spsv_common(const std::string& function_name, matrix_view A_view,
+                             InternalSparseMatHandleT internal_A_handle,
+                             dense_vector_handle_t x_handle, dense_vector_handle_t y_handle,
+                             bool is_alpha_host_accessible) {
+    THROW_IF_NULLPTR(function_name, internal_A_handle);
+    THROW_IF_NULLPTR(function_name, x_handle);
+    THROW_IF_NULLPTR(function_name, y_handle);
+
+    check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle);
+    if (A_view.type_view != matrix_descr::triangular) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Matrix view's `type_view` must be `matrix_descr::triangular`.");
+    }
+
+    if (internal_A_handle->all_use_buffer()) {
+        check_ptr_is_host_accessible(function_name, "alpha", is_alpha_host_accessible);
+    }
+}
+
+} // namespace oneapi::mkl::sparse::detail
+
+#endif // _ONEMKL_SRC_SPARSE_BLAS_COMMON_OP_VERIFICATION_HPP_
diff --git a/src/sparse_blas/generic_container.hpp b/src/sparse_blas/generic_container.hpp index 53bd50837..c2e8476a7 100644 --- a/src/sparse_blas/generic_container.hpp +++ b/src/sparse_blas/generic_container.hpp @@ -39,10 +39,12 @@ struct generic_container { // USM pointer, nullptr if the provided data is a buffer. void* usm_ptr; - // Buffer pointer, nullptr if the provided data is a USM pointer. 
- // The buffer is needed to properly handle the dependencies when the handle is used. - // Use a void* type for the buffer to avoid using template arguments in every function using data handles. - // Using reinterpret does not solve the issue as the returned buffer needs the type of the original buffer for the aligned_allocator. + // Buffer pointer, nullptr if the provided data is a USM pointer. The buffer + // is needed to properly handle the dependencies when the handle is used. + // Use a void* type for the buffer to avoid using template arguments in + // every function using data handles. Using `sycl::buffer::reinterpret` does + // not solve the issue as the returned buffer needs the type of the original + // buffer for the aligned_allocator. std::shared_ptr<void> buffer_ptr; // Underlying USM or buffer data type @@ -61,6 +63,10 @@ struct generic_container { buffer_ptr(std::make_shared<sycl::buffer<T, 1>>(buffer)), data_type(get_data_type<T>()) {} + bool use_buffer() const { + return static_cast<bool>(buffer_ptr); + } + template <typename T> void set_usm_ptr(T* ptr) { usm_ptr = ptr; @@ -108,7 +114,7 @@ struct generic_dense_handle { value_container(value_buffer) {} bool all_use_buffer() const { - return static_cast<bool>(value_container.buffer_ptr); + return value_container.use_buffer(); } data_type get_value_type() const { @@ -201,6 +207,8 @@ struct generic_dense_matrix_handle : public detail::generic_dense_handle<Backend } }; +enum class sparse_format { CSR, COO }; + /// Generic sparse_matrix_handle used by all backends template <typename BackendHandleT> struct generic_sparse_handle { @@ -210,34 +218,51 @@ struct generic_sparse_handle { generic_container col_container; generic_container value_container; + sparse_format format; + std::int64_t num_rows; + std::int64_t num_cols; + std::int64_t nnz; + oneapi::mkl::index_base index; std::int32_t properties_mask; bool can_be_reset; template <typename fpType, typename intType> generic_sparse_handle(BackendHandleT 
backend_handle, intType* row_ptr, intType* col_ptr, - fpType* value_ptr) + fpType* value_ptr, sparse_format format, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index) : backend_handle(backend_handle), row_container(generic_container(row_ptr)), col_container(generic_container(col_ptr)), value_container(generic_container(value_ptr)), + format(format), + num_rows(num_rows), + num_cols(num_cols), + nnz(nnz), + index(index), properties_mask(0), can_be_reset(true) {} template <typename fpType, typename intType> generic_sparse_handle(BackendHandleT backend_handle, const sycl::buffer<intType, 1> row_buffer, const sycl::buffer<intType, 1> col_buffer, - const sycl::buffer<fpType, 1> value_buffer) + const sycl::buffer<fpType, 1> value_buffer, sparse_format format, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, + oneapi::mkl::index_base index) : backend_handle(backend_handle), row_container(row_buffer), col_container(col_buffer), value_container(value_buffer), + format(format), + num_rows(num_rows), + num_cols(num_cols), + nnz(nnz), + index(index), properties_mask(0), can_be_reset(true) {} bool all_use_buffer() const { - return static_cast<bool>(value_container.buffer_ptr) && - static_cast<bool>(row_container.buffer_ptr) && - static_cast<bool>(col_container.buffer_ptr); + return value_container.use_buffer() && row_container.use_buffer() && + col_container.use_buffer(); } data_type get_value_type() const { @@ -248,19 +273,20 @@ struct generic_sparse_handle { return row_container.data_type; } - void set_matrix_property(oneapi::mkl::sparse::matrix_property property) { + void set_matrix_property(matrix_property property) { properties_mask |= matrix_property_to_mask(property); } - bool has_matrix_property(oneapi::mkl::sparse::matrix_property property) { + bool has_matrix_property(matrix_property property) { return properties_mask & matrix_property_to_mask(property); } private: - std::int32_t 
matrix_property_to_mask(oneapi::mkl::sparse::matrix_property property) { + std::int32_t matrix_property_to_mask(matrix_property property) { switch (property) { - case oneapi::mkl::sparse::matrix_property::symmetric: return 1 << 0; - case oneapi::mkl::sparse::matrix_property::sorted: return 1 << 1; + case matrix_property::symmetric: return 1 << 0; + case matrix_property::sorted: return 1 << 1; + case matrix_property::sorted_by_rows: return 1 << 2; default: throw oneapi::mkl::invalid_argument( "sparse_blas", "set_matrix_property", @@ -321,12 +347,38 @@ void check_all_containers_compatible(const std::string& function_name, } } -template <typename T, typename DependenciesT> -sycl::event submit_release(sycl::queue& queue, T* ptr, const DependenciesT& dependencies) { - return queue.submit([&](sycl::handler& cgh) { - cgh.depends_on(dependencies); - cgh.host_task([=]() { delete ptr; }); - }); +template <typename fpType, typename InternalHandleT> +void check_can_reset_value_handle(const std::string& function_name, + InternalHandleT* internal_handle, bool expect_buffer) { + if (internal_handle->get_value_type() != detail::get_data_type<fpType>()) { + throw oneapi::mkl::invalid_argument( + "sparse_blas", function_name, + "Incompatible data types expected " + + data_type_to_str(internal_handle->get_value_type()) + " but got " + + data_type_to_str(detail::get_data_type<fpType>())); + } + if (internal_handle->all_use_buffer() != expect_buffer) { + throw oneapi::mkl::invalid_argument( + "sparse_blas", function_name, "Cannot change the container type between buffer or USM"); + } +} + +template <typename fpType, typename intType, typename InternalHandleT> +void check_can_reset_sparse_handle(const std::string& function_name, + InternalHandleT* internal_smhandle, bool expect_buffer) { + check_can_reset_value_handle<fpType>(function_name, internal_smhandle, expect_buffer); + if (internal_smhandle->get_int_type() != detail::get_data_type<intType>()) { + throw 
oneapi::mkl::invalid_argument( + "sparse_blas", function_name, + "Incompatible data types expected " + + data_type_to_str(internal_smhandle->get_int_type()) + " but got " + + data_type_to_str(detail::get_data_type<intType>())); + } + if (!internal_smhandle->can_be_reset) { + throw mkl::unimplemented( + "sparse_blas", function_name, + "The backend does not support reseting the matrix handle's data after it was used in a computation."); + } } } // namespace oneapi::mkl::sparse::detail diff --git a/src/sparse_blas/macros.hpp b/src/sparse_blas/macros.hpp index 7eba01390..72aa39a75 100644 --- a/src/sparse_blas/macros.hpp +++ b/src/sparse_blas/macros.hpp @@ -36,10 +36,91 @@ FOR_EACH_FP_AND_INT_TYPE_HELPER(DEFINE_MACRO, std::int32_t, _i32); \ FOR_EACH_FP_AND_INT_TYPE_HELPER(DEFINE_MACRO, std::int64_t, _i64) +#define INSTANTIATE_DENSE_VECTOR_FUNCS(FP_TYPE, FP_SUFFIX) \ + template void init_dense_vector<FP_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \ + std::int64_t size, sycl::buffer<FP_TYPE, 1> val); \ + template void init_dense_vector<FP_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \ + std::int64_t size, FP_TYPE* val); \ + template void set_dense_vector_data<FP_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, \ + std::int64_t size, sycl::buffer<FP_TYPE, 1> val); \ + template void set_dense_vector_data<FP_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, \ + std::int64_t size, FP_TYPE* val) + +#define INSTANTIATE_DENSE_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX) \ + template void init_dense_matrix<FP_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ + oneapi::mkl::layout dense_layout, sycl::buffer<FP_TYPE, 1> val); \ + template void init_dense_matrix<FP_TYPE>( \ + sycl::queue & queue, 
oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ + oneapi::mkl::layout dense_layout, FP_TYPE* val); \ + template void set_dense_matrix_data<FP_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ + oneapi::mkl::layout dense_layout, sycl::buffer<FP_TYPE, 1> val); \ + template void set_dense_matrix_data<FP_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ + oneapi::mkl::layout dense_layout, FP_TYPE* val) + +#define INSTANTIATE_COO_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ + template void init_coo_matrix<FP_TYPE, INT_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ + oneapi::mkl::index_base index, sycl::buffer<INT_TYPE, 1> row_ind, \ + sycl::buffer<INT_TYPE, 1> col_ind, sycl::buffer<FP_TYPE, 1> val); \ + template void init_coo_matrix<FP_TYPE, INT_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ + oneapi::mkl::index_base index, INT_TYPE* row_ind, INT_TYPE* col_ind, FP_TYPE* val); \ + template void set_coo_matrix_data<FP_TYPE, INT_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ + sycl::buffer<INT_TYPE, 1> row_ind, sycl::buffer<INT_TYPE, 1> col_ind, \ + sycl::buffer<FP_TYPE, 1> val); \ + template void set_coo_matrix_data<FP_TYPE, INT_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, INT_TYPE* row_ind, \ + INT_TYPE* 
col_ind, FP_TYPE* val) + +#define INSTANTIATE_CSR_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ + template void init_csr_matrix<FP_TYPE, INT_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ + oneapi::mkl::index_base index, sycl::buffer<INT_TYPE, 1> row_ptr, \ + sycl::buffer<INT_TYPE, 1> col_ind, sycl::buffer<FP_TYPE, 1> val); \ + template void init_csr_matrix<FP_TYPE, INT_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ + oneapi::mkl::index_base index, INT_TYPE* row_ptr, INT_TYPE* col_ind, FP_TYPE* val); \ + template void set_csr_matrix_data<FP_TYPE, INT_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ + sycl::buffer<INT_TYPE, 1> row_ptr, sycl::buffer<INT_TYPE, 1> col_ind, \ + sycl::buffer<FP_TYPE, 1> val); \ + template void set_csr_matrix_data<FP_TYPE, INT_TYPE>( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, INT_TYPE* row_ptr, \ + INT_TYPE* col_ind, FP_TYPE* val) + #define THROW_IF_NULLPTR(FUNC_NAME, PTR) \ if (!(PTR)) { \ throw mkl::uninitialized("sparse_blas", FUNC_NAME, \ std::string(#PTR) + " must not be nullptr."); \ } +#define CHECK_DESCR_MATCH(descr, argument, optimize_func_name) \ + do { \ + if (descr->last_optimized_##argument != argument) { \ + throw mkl::invalid_argument( \ + "sparse_blas", __func__, \ + #argument " argument must match with the previous call to " #optimize_func_name); \ + } \ + } while (0) + #endif // _ONEMKL_SPARSE_BLAS_MACROS_HPP_ diff --git a/src/sparse_blas/sycl_helper.hpp b/src/sparse_blas/sycl_helper.hpp new file mode 100644 index 000000000..1a055b405 --- /dev/null 
+++ b/src/sparse_blas/sycl_helper.hpp
@@ -0,0 +1,88 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SRC_SPARSE_BLAS_SYCL_HELPER_HPP_
+#define _ONEMKL_SRC_SPARSE_BLAS_SYCL_HELPER_HPP_
+
+#include <vector>
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+namespace oneapi::mkl::sparse::detail {
+
+/// Return whether a pointer is accessible on the host.
+/// Pointers reported as usm::alloc::unknown are not USM allocations of the queue's context
+/// and are treated as regular host pointers.
+template <typename T>
+inline bool is_ptr_accessible_on_host(sycl::queue queue, const T* host_or_device_ptr) {
+    auto alloc_type = sycl::get_pointer_type(host_or_device_ptr, queue.get_context());
+    return alloc_type == sycl::usm::alloc::host || alloc_type == sycl::usm::alloc::shared ||
+           alloc_type == sycl::usm::alloc::unknown;
+}
+
+/// Return a scalar on the host from a pointer to host or device memory.
+/// Device memory is read with a blocking copy on \p queue.
+template <typename T>
+inline T get_scalar_on_host(sycl::queue& queue, const T* host_or_device_ptr,
+                            bool is_ptr_accessible_on_host) {
+    if (is_ptr_accessible_on_host) {
+        return *host_or_device_ptr;
+    }
+    T scalar;
+    auto event = queue.copy(host_or_device_ptr, &scalar, 1);
+    event.wait_and_throw();
+    return scalar;
+}
+
+/// Submit the release of \p ptr in a host_task waiting on the dependencies.
+/// \p ptr is destroyed with `delete` so it must have been allocated with `new`.
+template <typename T>
+sycl::event submit_release(sycl::queue& queue, T* ptr,
+                           const std::vector<sycl::event>& dependencies) {
+    return queue.submit([&](sycl::handler& cgh) {
+        cgh.depends_on(dependencies);
+        cgh.host_task([=]() { delete ptr; });
+    });
+}
+
+/// Merge multiple event dependencies into one.
+/// Return a default-constructed event if there are no dependencies, the dependency itself
+/// if there is only one, otherwise the event of an empty host_task depending on all of them.
+inline sycl::event collapse_dependencies(sycl::queue& queue,
+                                         const std::vector<sycl::event>& dependencies) {
+    if (dependencies.empty()) {
+        return {};
+    }
+    else if (dependencies.size() == 1) {
+        return dependencies[0];
+    }
+
+    return queue.submit([&](sycl::handler& cgh) {
+        cgh.depends_on(dependencies);
+        cgh.host_task([=]() {});
+    });
+}
+
+} // namespace oneapi::mkl::sparse::detail
+
+#endif // _ONEMKL_SRC_SPARSE_BLAS_SYCL_HELPER_HPP_
diff --git a/tests/unit_tests/CMakeLists.txt b/tests/unit_tests/CMakeLists.txt index 235f8c8e5..d250a03a0 100644 --- a/tests/unit_tests/CMakeLists.txt +++ b/tests/unit_tests/CMakeLists.txt @@ -192,6 +192,11 @@ foreach(domain ${TEST_TARGET_DOMAINS}) list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_dft_portfft) endif() + if(domain STREQUAL "sparse_blas" AND ENABLE_CUSPARSE_BACKEND) + add_dependencies(test_main_${domain}_ct onemkl_${domain}_cusparse) + list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_cusparse) + endif() + target_link_libraries(test_main_${domain}_ct PUBLIC gtest gtest_main diff --git a/tests/unit_tests/include/test_helper.hpp b/tests/unit_tests/include/test_helper.hpp index c6eaf3421..cb27c9098 100644 --- a/tests/unit_tests/include/test_helper.hpp +++ b/tests/unit_tests/include/test_helper.hpp @@ -176,6 +176,13 @@ #define TEST_RUN_PORTFFT_SELECT(q, func, ...) #endif +#ifdef ONEMKL_ENABLE_CUSPARSE_BACKEND +#define TEST_RUN_NVIDIAGPU_CUSPARSE_SELECT(q, func, ...) \ + func(oneapi::mkl::backend_selector<oneapi::mkl::backend::cusparse>{ q }, __VA_ARGS__) +#else +#define TEST_RUN_NVIDIAGPU_CUSPARSE_SELECT(q, func, ...) 
+#endif + #ifndef __HIPSYCL__ #define CHECK_HOST_OR_CPU(q) q.get_device().is_cpu() #else @@ -268,6 +275,9 @@ if (vendor_id == INTEL_ID) { \ TEST_RUN_INTELGPU_SELECT(q, func, __VA_ARGS__); \ } \ + else if (vendor_id == NVIDIA_ID) { \ + TEST_RUN_NVIDIAGPU_CUSPARSE_SELECT(q, func, __VA_ARGS__); \ + } \ } \ } while (0); diff --git a/tests/unit_tests/main_test.cpp b/tests/unit_tests/main_test.cpp index 7e2ad079a..fa7dffcc6 100644 --- a/tests/unit_tests/main_test.cpp +++ b/tests/unit_tests/main_test.cpp @@ -126,7 +126,8 @@ int main(int argc, char** argv) { #if !defined(ONEMKL_ENABLE_CUBLAS_BACKEND) && !defined(ONEMKL_ENABLE_CURAND_BACKEND) && \ !defined(ONEMKL_ENABLE_CUSOLVER_BACKEND) && \ !defined(ONEMKL_ENABLE_PORTBLAS_BACKEND_NVIDIA_GPU) && \ - !defined(ONEMKL_ENABLE_CUFFT_BACKEND) && !defined(ONEMKL_ENABLE_PORTFFT_BACKEND) + !defined(ONEMKL_ENABLE_CUFFT_BACKEND) && !defined(ONEMKL_ENABLE_PORTFFT_BACKEND) && \ + !defined(ONEMKL_ENABLE_CUSPARSE_BACKEND) if (dev.is_gpu() && vendor_id == NVIDIA_ID) continue; #endif diff --git a/tests/unit_tests/sparse_blas/include/test_common.hpp b/tests/unit_tests/sparse_blas/include/test_common.hpp index 628f55e2e..6637e0daa 100644 --- a/tests/unit_tests/sparse_blas/include/test_common.hpp +++ b/tests/unit_tests/sparse_blas/include/test_common.hpp @@ -59,12 +59,39 @@ enum sparse_matrix_format_t { COO, }; -static std::vector<std::set<oneapi::mkl::sparse::matrix_property>> test_matrix_properties{ - { oneapi::mkl::sparse::matrix_property::sorted }, - { oneapi::mkl::sparse::matrix_property::symmetric }, - { oneapi::mkl::sparse::matrix_property::sorted, - oneapi::mkl::sparse::matrix_property::symmetric } -}; +inline std::set<oneapi::mkl::sparse::matrix_property> get_default_matrix_properties( + sycl::queue queue, sparse_matrix_format_t format) { + auto vendor_id = oneapi::mkl::get_device_id(queue); + if (vendor_id == oneapi::mkl::device::nvidiagpu && format == sparse_matrix_format_t::COO) { + return { 
oneapi::mkl::sparse::matrix_property::sorted_by_rows }; + } + return {}; +} + +/// Return the combinations of matrix_properties to test other than the default +inline std::vector<std::set<oneapi::mkl::sparse::matrix_property>> +get_all_matrix_properties_combinations(sycl::queue queue, sparse_matrix_format_t format) { + auto vendor_id = oneapi::mkl::get_device_id(queue); + if (vendor_id == oneapi::mkl::device::nvidiagpu && format == sparse_matrix_format_t::COO) { + // Ensure all the sets have the sorted or sorted_by_rows properties + return { { oneapi::mkl::sparse::matrix_property::sorted }, + { oneapi::mkl::sparse::matrix_property::sorted_by_rows, + oneapi::mkl::sparse::matrix_property::symmetric }, + { oneapi::mkl::sparse::matrix_property::sorted, + oneapi::mkl::sparse::matrix_property::symmetric } }; + } + + std::vector<std::set<oneapi::mkl::sparse::matrix_property>> properties_combinations{ + { oneapi::mkl::sparse::matrix_property::sorted }, + { oneapi::mkl::sparse::matrix_property::symmetric }, + { oneapi::mkl::sparse::matrix_property::sorted, + oneapi::mkl::sparse::matrix_property::symmetric } + }; + if (format == sparse_matrix_format_t::COO) { + properties_combinations.push_back({ oneapi::mkl::sparse::matrix_property::sorted_by_rows }); + } + return properties_combinations; +} void print_error_code(sycl::exception const& e); @@ -207,9 +234,9 @@ template <typename fpType> fpType generate_data(bool is_diag) { rand_scalar<fpType> rand_data; if (is_diag) { - // Guarantee an amplitude >= 0.1 + // Guarantee a large amplitude fpType sign = (std::rand() % 2) * 2 - 1; - return rand_data(0.1, 0.5) * sign; + return rand_data(10, 20) * sign; } return rand_data(-0.5, 0.5); } @@ -337,8 +364,18 @@ intType generate_random_matrix(sparse_matrix_format_t format, const intType nrow /// In CSR format, the elements within a row are shuffled without changing ia. /// In COO format, all the elements are shuffled. 
template <typename fpType, typename intType> -void shuffle_sparse_matrix(sparse_matrix_format_t format, intType indexing, intType* ia, - intType* ja, fpType* a, intType nnz, std::size_t nrows) { +void shuffle_sparse_matrix_if_needed( + sparse_matrix_format_t format, + const std::set<oneapi::mkl::sparse::matrix_property>& matrix_properties, intType indexing, + intType* ia, intType* ja, fpType* a, intType nnz, std::size_t nrows) { + const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != + matrix_properties.cend(); + if (is_sorted) { + return; + } + const bool is_sorted_by_rows = + matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted_by_rows) != + matrix_properties.cend(); if (format == sparse_matrix_format_t::CSR) { for (std::size_t i = 0; i < nrows; ++i) { intType nnz_row = ia[i + 1] - ia[i]; @@ -349,18 +386,40 @@ void shuffle_sparse_matrix(sparse_matrix_format_t format, intType indexing, intT std::swap(a[q], a[j]); } } + // sorted_by_rows does not impact CSR } else if (format == sparse_matrix_format_t::COO) { - for (std::size_t i = 0; i < static_cast<std::size_t>(nnz); ++i) { - intType q = std::rand() % nnz; - // Swap elements i and q - std::swap(ia[q], ia[i]); - std::swap(ja[q], ja[i]); - std::swap(a[q], a[i]); + if (is_sorted_by_rows) { + std::size_t linear_idx = 0; + for (std::size_t i = 0; i < nrows; ++i) { + // Count the number of non-zero elements for the given row + std::size_t nnz_row = 1; + while (linear_idx + nnz_row < static_cast<std::size_t>(nnz) && + ia[linear_idx] == ia[linear_idx + nnz_row]) { + ++nnz_row; + } + for (std::size_t j = 0; j < nnz_row; ++j) { + // Swap elements within the same row + std::size_t q = linear_idx + (static_cast<std::size_t>(std::rand()) % nnz_row); + // Swap elements j and q + std::swap(ja[q], ja[linear_idx + j]); + std::swap(a[q], a[linear_idx + j]); + } + linear_idx += nnz_row; + } + } + else { + for (std::size_t i = 0; i < static_cast<std::size_t>(nnz); ++i) { + 
intType q = std::rand() % nnz; + // Swap elements i and q + std::swap(ia[q], ia[i]); + std::swap(ja[q], ja[i]); + std::swap(a[q], a[i]); + } } } else { - throw oneapi::mkl::exception("sparse_blas", "shuffle_sparse_matrix", + throw oneapi::mkl::exception("sparse_blas", "shuffle_sparse_matrix_if_needed", "Internal error: unsupported format"); } } diff --git a/tests/unit_tests/sparse_blas/include/test_spmm.hpp b/tests/unit_tests/sparse_blas/include/test_spmm.hpp index 17874cd63..153862f53 100644 --- a/tests/unit_tests/sparse_blas/include/test_spmm.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spmm.hpp @@ -58,6 +58,7 @@ void test_helper_with_format_with_transpose( const std::vector<oneapi::mkl::sparse::spmm_alg>& non_default_algorithms, oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B, int& num_passed, int& num_skipped) { + sycl::property_list queue_properties; double density_A_matrix = 0.8; fpType fp_zero = set_fp_value<fpType>()(0.f, 0.f); fpType fp_one = set_fp_value<fpType>()(1.f, 0.f); @@ -65,10 +66,13 @@ void test_helper_with_format_with_transpose( oneapi::mkl::layout col_major = oneapi::mkl::layout::col_major; oneapi::mkl::sparse::spmm_alg default_alg = oneapi::mkl::sparse::spmm_alg::default_alg; oneapi::mkl::sparse::matrix_view default_A_view; - std::set<oneapi::mkl::sparse::matrix_property> no_properties; bool no_reset_data = false; bool no_scalars_on_device = false; + // Queue is only used to get which matrix_property should be used for the tests. + sycl::queue properties_queue(*dev); + auto default_properties = get_default_matrix_properties(properties_queue, format); + { int m = 4, k = 6, n = 5; int nrows_A = (transpose_A != oneapi::mkl::transpose::nontrans) ? 
k : m; @@ -82,107 +86,119 @@ void test_helper_with_format_with_transpose( // Basic test EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, - default_alg, default_A_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Reset data EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, - default_alg, default_A_view, no_properties, true, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, + default_properties, true, no_scalars_on_device), num_passed, num_skipped); // Test alpha and beta on the device EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, - default_alg, default_A_view, no_properties, no_reset_data, true), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, + default_properties, no_reset_data, true), num_passed, num_skipped); // Test index_base 1 EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, - oneapi::mkl::index_base::one, col_major, transpose_A, transpose_B, - fp_one, fp_zero, ldb, ldc, 
default_alg, default_A_view, no_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, oneapi::mkl::index_base::one, col_major, transpose_A, + transpose_B, fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default alpha EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, set_fp_value<fpType>()(2.f, 1.5f), - fp_zero, ldb, ldc, default_alg, default_A_view, no_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + set_fp_value<fpType>()(2.f, 1.5f), fp_zero, ldb, ldc, default_alg, + default_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Test non-default beta EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, - set_fp_value<fpType>()(3.2f, 1.f), ldb, ldc, default_alg, - default_A_view, no_properties, no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, set_fp_value<fpType>()(3.2f, 1.f), ldb, ldc, default_alg, + default_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Test 0 alpha EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_zero, fp_one, ldb, ldc, - default_alg, default_A_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, 
format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_zero, fp_one, ldb, ldc, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test 0 alpha and beta EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_zero, fp_zero, ldb, ldc, - default_alg, default_A_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_zero, fp_zero, ldb, ldc, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default ldb EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb + 5, ldc, - default_alg, default_A_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb + 5, ldc, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default ldc EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc + 6, - default_alg, default_A_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc + 6, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), 
num_passed, num_skipped); // Test row major layout EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - oneapi::mkl::layout::row_major, transpose_A, transpose_B, fp_one, - fp_zero, ncols_B, ncols_C, default_alg, default_A_view, no_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, oneapi::mkl::layout::row_major, + transpose_A, transpose_B, fp_one, fp_zero, ncols_B, ncols_C, + default_alg, default_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Test int64 indices long long_nrows_A = 27, long_ncols_A = 13, long_ncols_C = 6; auto [long_ldc, long_ldb] = swap_if_transposed(transpose_A, long_nrows_A, long_ncols_A); EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i64(dev, format, long_nrows_A, long_ncols_A, long_ncols_C, - density_A_matrix, index_zero, col_major, transpose_A, transpose_B, - fp_one, fp_zero, long_ldb, long_ldc, default_alg, default_A_view, - no_properties, no_reset_data, no_scalars_on_device), + test_functor_i64(dev, queue_properties, format, long_nrows_A, long_ncols_A, + long_ncols_C, density_A_matrix, index_zero, col_major, transpose_A, + transpose_B, fp_one, fp_zero, long_ldb, long_ldc, default_alg, + default_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Test other algorithms for (auto alg : non_default_algorithms) { EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, - index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, - ldb, ldc, alg, default_A_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc, alg, default_A_view, default_properties, + 
no_reset_data, no_scalars_on_device), num_passed, num_skipped); } // Test matrix properties - for (auto properties : test_matrix_properties) { + for (auto properties : get_all_matrix_properties_combinations(properties_queue, format)) { EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, - index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, - ldb, ldc, default_alg, default_A_view, properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); } + // In-order queue + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i32(dev, { sycl::property::queue::in_order{} }, format, nrows_A, ncols_A, + ncols_C, density_A_matrix, index_zero, col_major, transpose_A, + transpose_B, fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), + num_passed, num_skipped); } { // Test different sizes @@ -195,10 +211,10 @@ void test_helper_with_format_with_transpose( int ldb = nrows_B; int ldc = nrows_C; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, - default_alg, default_A_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); } } diff --git a/tests/unit_tests/sparse_blas/include/test_spmv.hpp b/tests/unit_tests/sparse_blas/include/test_spmv.hpp index 654a1bfd4..50b5aa7db 100644 --- 
a/tests/unit_tests/sparse_blas/include/test_spmv.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spmv.hpp @@ -56,6 +56,7 @@ void test_helper_with_format_with_transpose( sparse_matrix_format_t format, const std::vector<oneapi::mkl::sparse::spmv_alg>& non_default_algorithms, oneapi::mkl::transpose transpose_val, int& num_passed, int& num_skipped) { + sycl::property_list queue_properties; double density_A_matrix = 0.8; fpType fp_zero = set_fp_value<fpType>()(0.f, 0.f); fpType fp_one = set_fp_value<fpType>()(1.f, 0.f); @@ -63,139 +64,152 @@ void test_helper_with_format_with_transpose( oneapi::mkl::index_base index_zero = oneapi::mkl::index_base::zero; oneapi::mkl::sparse::spmv_alg default_alg = oneapi::mkl::sparse::spmv_alg::default_alg; oneapi::mkl::sparse::matrix_view default_A_view; - std::set<oneapi::mkl::sparse::matrix_property> no_properties; bool no_reset_data = false; bool no_scalars_on_device = false; + // Queue is only used to get which matrix_property should be used for the tests. 
+ sycl::queue properties_queue(*dev); + auto default_properties = get_default_matrix_properties(properties_queue, format); + // Basic test EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, default_A_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Reset data EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, default_A_view, no_properties, true, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, default_A_view, + default_properties, true, no_scalars_on_device), num_passed, num_skipped); // Test alpha and beta on the device EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, default_A_view, no_properties, no_reset_data, - true), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, default_A_view, + default_properties, no_reset_data, true), num_passed, num_skipped); // Test index_base 1 EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, oneapi::mkl::index_base::one, transpose_val, fp_one, fp_zero, default_alg, - default_A_view, no_properties, no_reset_data, no_scalars_on_device), + default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, 
num_skipped); // Test non-default alpha EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - set_fp_value<fpType>()(2.f, 1.5f), fp_zero, default_alg, default_A_view, - no_properties, no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, set_fp_value<fpType>()(2.f, 1.5f), fp_zero, + default_alg, default_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Test non-default beta EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, set_fp_value<fpType>()(3.2f, 1.f), default_alg, default_A_view, - no_properties, no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, set_fp_value<fpType>()(3.2f, 1.f), + default_alg, default_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Test 0 alpha EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_zero, fp_one, default_alg, default_A_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_zero, fp_one, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test 0 alpha and beta EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_zero, fp_zero, default_alg, default_A_view, no_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_zero, fp_zero, 
default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test int64 indices EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i64(dev, format, 27L, 13L, density_A_matrix, index_zero, transpose_val, fp_one, - fp_zero, default_alg, default_A_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i64(dev, queue_properties, format, 27L, 13L, density_A_matrix, index_zero, + transpose_val, fp_one, fp_zero, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Lower triangular oneapi::mkl::sparse::matrix_view triangular_A_view( oneapi::mkl::sparse::matrix_descr::triangular); EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, triangular_A_view, no_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, triangular_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Upper triangular triangular_A_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, triangular_A_view, no_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, triangular_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Lower triangular unit diagonal oneapi::mkl::sparse::matrix_view triangular_unit_A_view( oneapi::mkl::sparse::matrix_descr::triangular); triangular_unit_A_view.diag_view = oneapi::mkl::diag::unit; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, 
format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, triangular_unit_A_view, no_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, + triangular_unit_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Upper triangular unit diagonal triangular_A_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, triangular_unit_A_view, no_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, + triangular_unit_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Lower symmetric oneapi::mkl::sparse::matrix_view symmetric_view(oneapi::mkl::sparse::matrix_descr::symmetric); EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, symmetric_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, symmetric_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Upper symmetric symmetric_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, symmetric_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + 
index_zero, transpose_val, fp_one, fp_zero, default_alg, symmetric_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Lower hermitian oneapi::mkl::sparse::matrix_view hermitian_view(oneapi::mkl::sparse::matrix_descr::hermitian); EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, hermitian_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, hermitian_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Upper hermitian hermitian_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, hermitian_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, hermitian_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test other algorithms for (auto alg : non_default_algorithms) { EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, - transpose_val, fp_one, fp_zero, alg, default_A_view, no_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); } // Test matrix properties - for (auto properties : test_matrix_properties) { + for (auto properties : get_all_matrix_properties_combinations(properties_queue, format)) { 
EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, - transpose_val, fp_one, fp_zero, default_alg, default_A_view, - properties, no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, + default_A_view, properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); } + // In-order queue + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i32(dev, { sycl::property::queue::in_order{} }, format, nrows_A, ncols_A, + density_A_matrix, index_zero, transpose_val, fp_one, fp_zero, default_alg, + default_A_view, default_properties, no_reset_data, no_scalars_on_device), + num_passed, num_skipped); } /** diff --git a/tests/unit_tests/sparse_blas/include/test_spsv.hpp b/tests/unit_tests/sparse_blas/include/test_spsv.hpp index 032a0875b..94f5eacb1 100644 --- a/tests/unit_tests/sparse_blas/include/test_spsv.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spsv.hpp @@ -51,6 +51,7 @@ void test_helper_with_format(testFunctorI32 test_functor_i32, testFunctorI64 tes sycl::device* dev, sparse_matrix_format_t format, oneapi::mkl::transpose transpose_val, int& num_passed, int& num_skipped) { + sycl::property_list queue_properties; double density_A_matrix = 0.144; fpType alpha = set_fp_value<fpType>()(1.f, 0.f); int m = 277; @@ -60,89 +61,104 @@ void test_helper_with_format(testFunctorI32 test_functor_i32, testFunctorI64 tes oneapi::mkl::sparse::matrix_view default_A_view(oneapi::mkl::sparse::matrix_descr::triangular); oneapi::mkl::sparse::matrix_view upper_A_view(oneapi::mkl::sparse::matrix_descr::triangular); upper_A_view.uplo_view = oneapi::mkl::uplo::upper; - std::set<oneapi::mkl::sparse::matrix_property> no_properties; bool no_reset_data = false; bool no_scalars_on_device = false; + // Queue is only used to get which matrix_property should be used for the tests. 
+ sycl::queue properties_queue(*dev); + auto default_properties = get_default_matrix_properties(properties_queue, format); + // Basic test - EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i32(dev, format, m, density_A_matrix, index_zero, - transpose_val, alpha, default_alg, default_A_view, - no_properties, no_reset_data, no_scalars_on_device), - num_passed, num_skipped); + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, default_A_view, default_properties, + no_reset_data, no_scalars_on_device), + num_passed, num_skipped); // Reset data EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, default_A_view, no_properties, true, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, default_A_view, default_properties, + true, no_scalars_on_device), num_passed, num_skipped); // Test alpha on the device EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, default_A_view, no_properties, no_reset_data, true), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, default_A_view, default_properties, + no_reset_data, true), num_passed, num_skipped); // Test index_base 1 EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, oneapi::mkl::index_base::one, - transpose_val, alpha, default_alg, default_A_view, no_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, + oneapi::mkl::index_base::one, transpose_val, alpha, default_alg, + default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test upper triangular matrix - 
EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i32(dev, format, m, density_A_matrix, index_zero, - transpose_val, alpha, default_alg, upper_A_view, - no_properties, no_reset_data, no_scalars_on_device), - num_passed, num_skipped); + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, upper_A_view, default_properties, + no_reset_data, no_scalars_on_device), + num_passed, num_skipped); // Test lower triangular unit diagonal matrix oneapi::mkl::sparse::matrix_view triangular_unit_A_view( oneapi::mkl::sparse::matrix_descr::triangular); triangular_unit_A_view.diag_view = oneapi::mkl::diag::unit; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, triangular_unit_A_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, triangular_unit_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test upper triangular unit diagonal matrix triangular_unit_A_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, triangular_unit_A_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, triangular_unit_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default alpha EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, - set_fp_value<fpType>()(2.f, 1.5f), default_alg, default_A_view, - no_properties, no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, 
index_zero, + transpose_val, set_fp_value<fpType>()(2.f, 1.5f), default_alg, + default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test int64 indices - EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i64(dev, format, 15L, density_A_matrix, index_zero, - transpose_val, alpha, default_alg, default_A_view, - no_properties, no_reset_data, no_scalars_on_device), - num_passed, num_skipped); + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i64(dev, queue_properties, format, 15L, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, default_A_view, default_properties, + no_reset_data, no_scalars_on_device), + num_passed, num_skipped); // Test lower no_optimize_alg EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - no_optimize_alg, default_A_view, no_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, no_optimize_alg, default_A_view, default_properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test upper no_optimize_alg - EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i32(dev, format, m, density_A_matrix, index_zero, - transpose_val, alpha, no_optimize_alg, upper_A_view, - no_properties, no_reset_data, no_scalars_on_device), - num_passed, num_skipped); + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, no_optimize_alg, upper_A_view, default_properties, + no_reset_data, no_scalars_on_device), + num_passed, num_skipped); // Test matrix properties - for (auto properties : test_matrix_properties) { + for (auto properties : get_all_matrix_properties_combinations(properties_queue, format)) { // Basic test with matrix properties EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - 
default_alg, default_A_view, properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, default_A_view, properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test lower no_optimize_alg with matrix properties EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - no_optimize_alg, default_A_view, properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, no_optimize_alg, default_A_view, properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); } + // In-order queue + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i32(dev, { sycl::property::queue::in_order{} }, format, m, density_A_matrix, + index_zero, transpose_val, alpha, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), + num_passed, num_skipped); } /** diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp index 0d95630bf..50f0fb2e7 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp @@ -28,8 +28,9 @@ extern std::vector<sycl::device*> devices; namespace { template <typename fpType, typename intType> -int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, intType ncols_A, - intType ncols_C, double density_A_matrix, oneapi::mkl::index_base index, +int test_spmm(sycl::device* dev, sycl::property_list queue_properties, + sparse_matrix_format_t format, intType nrows_A, intType ncols_A, intType ncols_C, + double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::layout dense_matrix_layout, oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B, fpType 
alpha, fpType beta, intType ldb, intType ldc, oneapi::mkl::sparse::spmm_alg alg, @@ -40,7 +41,7 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, // Scalars on the device is not planned to be supported with the buffer API return 1; } - sycl::queue main_queue(*dev, exception_handler_t()); + sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); if (require_square_matrix(A_view, matrix_properties)) { ncols_A = nrows_A; @@ -51,8 +52,6 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, auto [opa_nrows, opa_ncols] = swap_if_transposed<std::size_t>(transpose_A, nrows_A, ncols_A); auto [opb_nrows, opb_ncols] = swap_if_transposed<std::int64_t>(transpose_B, opa_ncols, ncols_C); intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1; - const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != - matrix_properties.cend(); const bool is_symmetric = matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); @@ -73,10 +72,9 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, std::vector<fpType> c_ref_host(c_host); // Shuffle ordering of column indices/values to test sortedness - if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz, - static_cast<std::size_t>(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), nnz, + static_cast<std::size_t>(nrows_A)); auto ia_buf = make_buffer(ia_host); auto ja_buf = make_buffer(ja_host); @@ -119,10 +117,9 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, intType reset_nnz = generate_random_matrix<fpType, intType>( format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric); - if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, 
ia_host.data(), ja_host.data(), - a_host.data(), reset_nnz, static_cast<std::size_t>(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), reset_nnz, + static_cast<std::size_t>(nrows_A)); if (reset_nnz > nnz) { ia_buf = make_buffer(ia_host); ja_buf = make_buffer(ja_host); diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp index 3f09594eb..1db7c7a25 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp @@ -28,15 +28,16 @@ extern std::vector<sycl::device*> devices; namespace { template <typename fpType, typename intType> -int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, intType ncols_A, - intType ncols_C, double density_A_matrix, oneapi::mkl::index_base index, +int test_spmm(sycl::device* dev, sycl::property_list queue_properties, + sparse_matrix_format_t format, intType nrows_A, intType ncols_A, intType ncols_C, + double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::layout dense_matrix_layout, oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B, fpType alpha, fpType beta, intType ldb, intType ldc, oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::matrix_view A_view, const std::set<oneapi::mkl::sparse::matrix_property>& matrix_properties, bool reset_data, bool test_scalar_on_device) { - sycl::queue main_queue(*dev, exception_handler_t()); + sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); if (require_square_matrix(A_view, matrix_properties)) { ncols_A = nrows_A; @@ -47,8 +48,6 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, auto [opa_nrows, opa_ncols] = swap_if_transposed<std::size_t>(transpose_A, nrows_A, ncols_A); auto [opb_nrows, opb_ncols] = swap_if_transposed<std::int64_t>(transpose_B, opa_ncols, 
ncols_C); intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1; - const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != - matrix_properties.cend(); const bool is_symmetric = matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); @@ -69,10 +68,9 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, std::vector<fpType> c_ref_host(c_host); // Shuffle ordering of column indices/values to test sortedness - if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz, - static_cast<std::size_t>(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), nnz, + static_cast<std::size_t>(nrows_A)); auto ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size()); auto ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size()); @@ -88,26 +86,21 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, fpType* b_usm = b_usm_uptr.get(); fpType* c_usm = c_usm_uptr.get(); - std::vector<sycl::event> mat_dependencies; - std::vector<sycl::event> spmm_dependencies; + std::vector<sycl::event> dependencies; // Copy host to device - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(ia_usm, ia_host.data(), ia_host.size() * sizeof(intType))); - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(ja_usm, ja_host.data(), ja_host.size() * sizeof(intType))); - mat_dependencies.push_back( - main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType))); - spmm_dependencies.push_back( - main_queue.memcpy(b_usm, b_host.data(), b_host.size() * sizeof(fpType))); - spmm_dependencies.push_back( - main_queue.memcpy(c_usm, c_host.data(), c_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(a_usm, a_host.data(), a_host.size() * 
sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(b_usm, b_host.data(), b_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(c_usm, c_host.data(), c_host.size() * sizeof(fpType))); fpType* alpha_host_or_usm_ptr = &alpha; fpType* beta_host_or_usm_ptr = &beta; if (test_scalar_on_device) { - spmm_dependencies.push_back( - main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); - spmm_dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType))); alpha_host_or_usm_ptr = alpha_usm_uptr.get(); beta_host_or_usm_ptr = beta_usm_uptr.get(); } @@ -141,24 +134,20 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, sycl::event ev_opt; CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spmm_optimize, main_queue, transpose_A, transpose_B, &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, descr, - workspace_usm.get(), mat_dependencies); + workspace_usm.get(), dependencies); - spmm_dependencies.push_back(ev_opt); CALL_RT_OR_CT(ev_spmm = oneapi::mkl::sparse::spmm, main_queue, transpose_A, transpose_B, - &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, descr, - spmm_dependencies); + &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, descr, { ev_opt }); if (reset_data) { intType reset_nnz = generate_random_matrix<fpType, intType>( format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric); - if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), reset_nnz, static_cast<std::size_t>(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), reset_nnz, + static_cast<std::size_t>(nrows_A)); + ev_spmm.wait_and_throw(); if (reset_nnz > nnz) { - //
Wait before freeing usm pointers - ev_spmm.wait_and_throw(); ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size()); ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size()); a_usm_uptr = malloc_device_uptr<fpType>(main_queue, a_host.size()); @@ -168,14 +157,14 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, } nnz = reset_nnz; - mat_dependencies.clear(); - mat_dependencies.push_back(main_queue.memcpy( - ia_usm, ia_host.data(), ia_host.size() * sizeof(intType), ev_spmm)); - mat_dependencies.push_back(main_queue.memcpy( - ja_usm, ja_host.data(), ja_host.size() * sizeof(intType), ev_spmm)); - mat_dependencies.push_back( + dependencies.clear(); + dependencies.push_back(main_queue.memcpy(ia_usm, ia_host.data(), + ia_host.size() * sizeof(intType), ev_spmm)); + dependencies.push_back(main_queue.memcpy(ja_usm, ja_host.data(), + ja_host.size() * sizeof(intType), ev_spmm)); + dependencies.push_back( main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType), ev_spmm)); - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(c_usm, c_host.data(), c_host.size() * sizeof(fpType), ev_spmm)); set_matrix_data(main_queue, format, A_handle, nrows_A, ncols_A, nnz, index, ia_usm, ja_usm, a_usm); @@ -190,7 +179,7 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spmm_optimize, main_queue, transpose_A, transpose_B, &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, - descr, workspace_usm.get(), mat_dependencies); + descr, workspace_usm.get(), dependencies); CALL_RT_OR_CT(ev_spmm = oneapi::mkl::sparse::spmm, main_queue, transpose_A, transpose_B, &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, descr, diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp index 1864f6065..96328372d 100644 --- 
a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp @@ -28,7 +28,8 @@ extern std::vector<sycl::device*> devices; namespace { template <typename fpType, typename intType> -int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, intType ncols_A, +int test_spmv(sycl::device* dev, sycl::property_list queue_properties, + sparse_matrix_format_t format, intType nrows_A, intType ncols_A, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha, fpType beta, oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::matrix_view A_view, @@ -38,15 +39,13 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, // Scalars on the device is not planned to be supported with the buffer API return 1; } - sycl::queue main_queue(*dev, exception_handler_t()); + sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); if (require_square_matrix(A_view, matrix_properties)) { ncols_A = nrows_A; } auto [opa_nrows, opa_ncols] = swap_if_transposed<std::size_t>(transpose_val, nrows_A, ncols_A); intType indexing = (index == oneapi::mkl::index_base::zero) ? 
0 : 1; - const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != - matrix_properties.cend(); const bool is_symmetric = matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); @@ -66,10 +65,9 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, std::vector<fpType> y_ref_host(y_host); // Shuffle ordering of column indices/values to test sortedness - if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz, - static_cast<std::size_t>(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), nnz, + static_cast<std::size_t>(nrows_A)); auto ia_buf = make_buffer(ia_host); auto ja_buf = make_buffer(ja_host); @@ -109,10 +107,9 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, intType reset_nnz = generate_random_matrix<fpType, intType>( format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric); - if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), reset_nnz, static_cast<std::size_t>(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), reset_nnz, + static_cast<std::size_t>(nrows_A)); if (reset_nnz > nnz) { ia_buf = make_buffer(ia_host); ja_buf = make_buffer(ja_host); diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp index b24a6e0ee..c6159aaf4 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp @@ -28,21 +28,20 @@ extern std::vector<sycl::device*> devices; namespace { template <typename fpType, typename intType> -int test_spmv(sycl::device* dev, sparse_matrix_format_t format, 
intType nrows_A, intType ncols_A, +int test_spmv(sycl::device* dev, sycl::property_list queue_properties, + sparse_matrix_format_t format, intType nrows_A, intType ncols_A, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha, fpType beta, oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::matrix_view A_view, const std::set<oneapi::mkl::sparse::matrix_property>& matrix_properties, bool reset_data, bool test_scalar_on_device) { - sycl::queue main_queue(*dev, exception_handler_t()); + sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); if (require_square_matrix(A_view, matrix_properties)) { ncols_A = nrows_A; } auto [opa_nrows, opa_ncols] = swap_if_transposed<std::size_t>(transpose_val, nrows_A, ncols_A); intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1; - const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != - matrix_properties.cend(); const bool is_symmetric = matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); @@ -62,10 +61,9 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, std::vector<fpType> y_ref_host(y_host); // Shuffle ordering of column indices/values to test sortedness - if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz, - static_cast<std::size_t>(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), nnz, + static_cast<std::size_t>(nrows_A)); auto ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size()); auto ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size()); @@ -81,26 +79,21 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, fpType* x_usm = x_usm_uptr.get(); fpType* y_usm = y_usm_uptr.get(); - std::vector<sycl::event> 
mat_dependencies; - std::vector<sycl::event> spmv_dependencies; + std::vector<sycl::event> dependencies; // Copy host to device - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(ia_usm, ia_host.data(), ia_host.size() * sizeof(intType))); - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(ja_usm, ja_host.data(), ja_host.size() * sizeof(intType))); - mat_dependencies.push_back( - main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType))); - spmv_dependencies.push_back( - main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType))); - spmv_dependencies.push_back( - main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType))); fpType* alpha_host_or_usm_ptr = &alpha; fpType* beta_host_or_usm_ptr = &beta; if (test_scalar_on_device) { - spmv_dependencies.push_back( - main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); - spmv_dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType))); alpha_host_or_usm_ptr = alpha_usm_uptr.get(); beta_host_or_usm_ptr = beta_usm_uptr.get(); } @@ -133,24 +126,21 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, sycl::event ev_opt; CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spmv_optimize, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, beta_host_or_usm_ptr, - y_handle, alg, descr, workspace_usm.get(), mat_dependencies); + y_handle, alg, descr, workspace_usm.get(), dependencies); -
spmv_dependencies.push_back(ev_opt); CALL_RT_OR_CT(ev_spmv = oneapi::mkl::sparse::spmv, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, beta_host_or_usm_ptr, - y_handle, alg, descr, spmv_dependencies); + y_handle, alg, descr, { ev_opt }); if (reset_data) { intType reset_nnz = generate_random_matrix<fpType, intType>( format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric); - if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), reset_nnz, static_cast<std::size_t>(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), reset_nnz, + static_cast<std::size_t>(nrows_A)); + ev_spmv.wait_and_throw(); if (reset_nnz > nnz) { - // Wait before freeing usm pointers - ev_spmv.wait_and_throw(); ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size()); ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size()); a_usm_uptr = malloc_device_uptr<fpType>(main_queue, a_host.size()); @@ -160,14 +150,14 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, } nnz = reset_nnz; - mat_dependencies.clear(); - mat_dependencies.push_back(main_queue.memcpy( - ia_usm, ia_host.data(), ia_host.size() * sizeof(intType), ev_spmv)); - mat_dependencies.push_back(main_queue.memcpy( - ja_usm, ja_host.data(), ja_host.size() * sizeof(intType), ev_spmv)); - mat_dependencies.push_back( + dependencies.clear(); + dependencies.push_back(main_queue.memcpy(ia_usm, ia_host.data(), + ia_host.size() * sizeof(intType), ev_spmv)); + dependencies.push_back(main_queue.memcpy(ja_usm, ja_host.data(), + ja_host.size() * sizeof(intType), ev_spmv)); + dependencies.push_back( main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType), ev_spmv)); - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(y_usm, y_host.data(), y_host.size() * 
sizeof(fpType), ev_spmv)); set_matrix_data(main_queue, format, A_handle, nrows_A, ncols_A, nnz, index, ia_usm, ja_usm, a_usm); @@ -182,7 +172,7 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spmv_optimize, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, beta_host_or_usm_ptr, - y_handle, alg, descr, workspace_usm.get(), mat_dependencies); + y_handle, alg, descr, workspace_usm.get(), dependencies); CALL_RT_OR_CT(ev_spmv = oneapi::mkl::sparse::spmv, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, beta_host_or_usm_ptr, diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp index ca5689d13..19c237dc0 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp @@ -28,7 +28,8 @@ extern std::vector<sycl::device*> devices; namespace { template <typename fpType, typename intType> -int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, double density_A_matrix, +int test_spsv(sycl::device* dev, sycl::property_list queue_properties, + sparse_matrix_format_t format, intType m, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha, oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::matrix_view A_view, const std::set<oneapi::mkl::sparse::matrix_property>& matrix_properties, @@ -37,16 +38,17 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl // Scalars on the device is not planned to be supported with the buffer API return 1; } - sycl::queue main_queue(*dev, exception_handler_t()); + sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); intType indexing = (index == oneapi::mkl::index_base::zero) ? 
0 : 1; const std::size_t mu = static_cast<std::size_t>(m); - const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != - matrix_properties.cend(); const bool is_symmetric = matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); + // Use a fixed seed for operations very sensitive to the input data + std::srand(1); + // Input matrix std::vector<intType> ia_host, ja_host; std::vector<fpType> a_host; @@ -69,10 +71,8 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl std::vector<fpType> y_ref_host(y_host); // Shuffle ordering of column indices/values to test sortedness - if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz, - mu); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), nnz, mu); auto ia_buf = make_buffer(ia_host); auto ja_buf = make_buffer(ja_host); @@ -109,10 +109,8 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl intType reset_nnz = generate_random_matrix<fpType, intType>( format, m, m, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric, require_diagonal); - if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), reset_nnz, mu); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), reset_nnz, mu); if (reset_nnz > nnz) { ia_buf = make_buffer(ia_host); ja_buf = make_buffer(ja_host); diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp index 7a43a7112..68023591b 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp @@ -28,21 +28,23 @@ extern std::vector<sycl::device*> devices; namespace { template 
<typename fpType, typename intType> -int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, double density_A_matrix, +int test_spsv(sycl::device* dev, sycl::property_list queue_properties, + sparse_matrix_format_t format, intType m, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha, oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::matrix_view A_view, const std::set<oneapi::mkl::sparse::matrix_property>& matrix_properties, bool reset_data, bool test_scalar_on_device) { - sycl::queue main_queue(*dev, exception_handler_t()); + sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1; const std::size_t mu = static_cast<std::size_t>(m); - const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != - matrix_properties.cend(); const bool is_symmetric = matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); + // Use a fixed seed for operations very sensitive to the input data + std::srand(1); + // Input matrix std::vector<intType> ia_host, ja_host; std::vector<fpType> a_host; @@ -65,10 +67,8 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl std::vector<fpType> y_ref_host(y_host); // Shuffle ordering of column indices/values to test sortedness - if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz, - mu); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), nnz, mu); auto ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size()); auto ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size()); @@ -83,24 +83,19 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl fpType* x_usm = x_usm_uptr.get(); fpType* y_usm = 
y_usm_uptr.get(); - std::vector<sycl::event> mat_dependencies; - std::vector<sycl::event> spsv_dependencies; + std::vector<sycl::event> dependencies; // Copy host to device - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(ia_usm, ia_host.data(), ia_host.size() * sizeof(intType))); - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(ja_usm, ja_host.data(), ja_host.size() * sizeof(intType))); - mat_dependencies.push_back( - main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType))); - spsv_dependencies.push_back( - main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType))); - spsv_dependencies.push_back( - main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType))); fpType* alpha_host_or_usm_ptr = &alpha; if (test_scalar_on_device) { - spsv_dependencies.push_back( - main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); alpha_host_or_usm_ptr = alpha_usm_uptr.get(); } @@ -129,24 +124,20 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl sycl::event ev_opt; CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spsv_optimize, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, y_handle, alg, descr, - workspace_usm.get(), mat_dependencies); + workspace_usm.get(), dependencies); - spsv_dependencies.push_back(ev_opt); CALL_RT_OR_CT(ev_spsv = oneapi::mkl::sparse::spsv, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, y_handle, alg, descr, - spsv_dependencies); + { ev_opt }); if (reset_data) { intType reset_nnz =
generate_random_matrix<fpType, intType>( format, m, m, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric, require_diagonal); - if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), reset_nnz, mu); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), reset_nnz, mu); + ev_spsv.wait_and_throw(); if (reset_nnz > nnz) { - // Wait before freeing usm pointers - ev_spsv.wait_and_throw(); ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size()); ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size()); a_usm_uptr = malloc_device_uptr<fpType>(main_queue, a_host.size()); @@ -156,14 +147,14 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl } nnz = reset_nnz; - mat_dependencies.clear(); - mat_dependencies.push_back(main_queue.memcpy( - ia_usm, ia_host.data(), ia_host.size() * sizeof(intType), ev_spsv)); - mat_dependencies.push_back(main_queue.memcpy( - ja_usm, ja_host.data(), ja_host.size() * sizeof(intType), ev_spsv)); - mat_dependencies.push_back( + dependencies.clear(); + dependencies.push_back(main_queue.memcpy(ia_usm, ia_host.data(), + ia_host.size() * sizeof(intType), ev_spsv)); + dependencies.push_back(main_queue.memcpy(ja_usm, ja_host.data(), + ja_host.size() * sizeof(intType), ev_spsv)); + dependencies.push_back( main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType), ev_spsv)); - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType), ev_spsv)); set_matrix_data(main_queue, format, A_handle, m, m, nnz, index, ia_usm, ja_usm, a_usm); @@ -177,7 +168,7 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spsv_optimize, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, y_handle, alg, 
descr, - workspace_usm.get(), mat_dependencies); + workspace_usm.get(), dependencies); CALL_RT_OR_CT(ev_spsv = oneapi::mkl::sparse::spsv, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, y_handle, alg, descr, From bd5d47bd4be71fce665a5b97f40448cd5f6cc7a0 Mon Sep 17 00:00:00 2001 From: Romain Biessy <romain.biessy@codeplay.com> Date: Wed, 30 Oct 2024 18:21:53 +0100 Subject: [PATCH 2/2] [DFT] Reword comment (#607) --- src/dft/backends/mklgpu/commit.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dft/backends/mklgpu/commit.cpp b/src/dft/backends/mklgpu/commit.cpp index bae1eb69f..8405c3891 100644 --- a/src/dft/backends/mklgpu/commit.cpp +++ b/src/dft/backends/mklgpu/commit.cpp @@ -43,7 +43,7 @@ #include <oneapi/mkl/dft.hpp> #endif -// MKL 2024.1 deprecates input/output strides. +// Intel oneMKL 2024.1 deprecates input/output strides. #if INTEL_MKL_VERSION < 20240001 #error MKLGPU requires oneMKL 2024.1 or later #endif