From c8dc9a90157a20e75b3b76fafdac0053ba996cf9 Mon Sep 17 00:00:00 2001
From: Romain Biessy <>
Date: Tue, 29 Oct 2024 10:48:44 +0100
Subject: [PATCH 1/2] [SPARSE] Add support for cuSPARSE backend (#527)

 CMakeLists.txt                                |   8 +-                                     |  20 +-
 cmake/FindCompiler.cmake                      |   4 +-
 docs/building_the_project_with_dpcpp.rst      |   8 +-
 docs/domains/sparse_linear_algebra.rst        | 139 ++++-
 examples/                            | 124 ++---
 .../compile_time_dispatching/CMakeLists.txt   |  21 +-
 ... sparse_blas_spmv_usm_mklcpu_cusparse.cpp} | 188 +++----
 .../run_time_dispatching/CMakeLists.txt       |   3 +
 include/oneapi/mkl/detail/backends.hpp        |  29 +-
 include/oneapi/mkl/detail/backends_table.hpp  |   6 +
 include/oneapi/mkl/sparse_blas.hpp            |   3 +
 .../cusparse/onemkl_sparse_blas_cusparse.hpp  |  33 ++
 .../detail/cusparse/sparse_blas_ct.hpp        |  40 ++
 include/oneapi/mkl/sparse_blas/types.hpp      |   1 +
 src/CMakeLists.txt                            |   1 +
 src/                             |   1 +
 src/sparse_blas/backends/CMakeLists.txt       |   4 +
 .../backends/cusparse/CMakeLists.txt          |  85 +++
 .../backends/cusparse/cusparse_error.hpp      | 103 ++++
 .../cusparse/cusparse_global_handle.hpp       |  63 +++
 .../backends/cusparse/cusparse_handles.cpp    | 485 ++++++++++++++++++
 .../backends/cusparse/cusparse_handles.hpp    |  95 ++++
 .../backends/cusparse/cusparse_helper.hpp     | 166 ++++++
 .../cusparse/cusparse_scope_handle.cpp        | 147 ++++++
 .../cusparse/cusparse_scope_handle.hpp        |  88 ++++
 .../backends/cusparse/cusparse_task.hpp       | 431 ++++++++++++++++
 .../backends/cusparse/cusparse_wrappers.cpp   |  32 ++
 .../cusparse/operations/cusparse_spmm.cpp     | 336 ++++++++++++
 .../cusparse/operations/cusparse_spmv.cpp     | 335 ++++++++++++
 .../cusparse/operations/cusparse_spsv.cpp     | 289 +++++++++++
 .../backends/mkl_common/mkl_dispatch.hpp      |  37 ++
 .../backends/mkl_common/mkl_handles.cxx       | 217 +++-----
 .../backends/mkl_common/mkl_handles.hpp       |   2 +
 .../backends/mkl_common/mkl_helper.hpp        | 111 ----
 .../backends/mkl_common/mkl_spmm.cxx          | 122 ++---
 .../backends/mkl_common/mkl_spmv.cxx          | 119 ++---
 .../backends/mkl_common/mkl_spsv.cxx          |  98 ++--
 .../backends/mklcpu/mklcpu_handles.cpp        |   2 +-
 .../backends/mklcpu/mklcpu_operations.cpp     |   4 +-
 .../backends/mklgpu/mklgpu_handles.cpp        |   2 +-
 .../backends/mklgpu/mklgpu_operations.cpp     |   4 +-
 src/sparse_blas/common_op_verification.hpp    | 137 +++++
 src/sparse_blas/generic_container.hpp         |  94 +++-
 src/sparse_blas/macros.hpp                    |  81 +++
 src/sparse_blas/sycl_helper.hpp               |  80 +++
 tests/unit_tests/CMakeLists.txt               |   5 +
 tests/unit_tests/include/test_helper.hpp      |  10 +
 tests/unit_tests/main_test.cpp                |   3 +-
 .../sparse_blas/include/test_common.hpp       |  93 +++-
 .../sparse_blas/include/test_spmm.hpp         | 138 ++---
 .../sparse_blas/include/test_spmv.hpp         | 130 ++---
 .../sparse_blas/include/test_spsv.hpp         | 102 ++--
 .../sparse_blas/source/sparse_spmm_buffer.cpp |  23 +-
 .../sparse_blas/source/sparse_spmm_usm.cpp    |  69 ++-
 .../sparse_blas/source/sparse_spmv_buffer.cpp |  21 +-
 .../sparse_blas/source/sparse_spmv_usm.cpp    |  66 +--
 .../sparse_blas/source/sparse_spsv_buffer.cpp |  22 +-
 .../sparse_blas/source/sparse_spsv_usm.cpp    |  65 +--
 59 files changed, 4113 insertions(+), 1032 deletions(-)
 rename examples/sparse_blas/compile_time_dispatching/{sparse_blas_spmv_usm_mklcpu.cpp => sparse_blas_spmv_usm_mklcpu_cusparse.cpp} (55%)
 create mode 100644 include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp
 create mode 100644 include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/CMakeLists.txt
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_error.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_handles.cpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_handles.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_helper.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_task.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp
 create mode 100644 src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp
 create mode 100644 src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp
 create mode 100644 src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp
 create mode 100644 src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp
 delete mode 100644 src/sparse_blas/backends/mkl_common/mkl_helper.hpp
 create mode 100644 src/sparse_blas/common_op_verification.hpp
 create mode 100644 src/sparse_blas/sycl_helper.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c4cad8e17..76f5aedc2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,6 +60,9 @@ option(ENABLE_CUFFT_BACKEND "Enable the cuFFT backend for the DFT interface" OFF
 option(ENABLE_ROCFFT_BACKEND "Enable the rocFFT backend for the DFT interface" OFF)
 option(ENABLE_PORTFFT_BACKEND "Enable the portFFT DFT backend for the DFT interface. Cannot be used with other DFT backends." OFF)
+# sparse
+option(ENABLE_CUSPARSE_BACKEND "Enable the cuSPARSE backend for the SPARSE_BLAS interface" OFF)
 set(ONEMKL_SYCL_IMPLEMENTATION "dpc++" CACHE STRING "Name of the SYCL compiler")
 set(HIP_TARGETS "" CACHE STRING "Target HIP architectures")
@@ -102,7 +105,8 @@ if(ENABLE_MKLGPU_BACKEND
   list(APPEND DOMAINS_LIST "dft")
   list(APPEND DOMAINS_LIST "sparse_blas")
     set(CMAKE_CXX_COMPILER "clang++")
diff --git a/ b/
index 5dc8c9c3b..dc023c67c 100644
--- a/
+++ b/
@@ -18,8 +18,8 @@ oneMKL is part of the [UXL Foundation](
-            <td rowspan=12 align="center">oneMKL interface</td>
-            <td rowspan=12 align="center">oneMKL selector</td>
+            <td rowspan=13 align="center">oneMKL interface</td>
+            <td rowspan=13 align="center">oneMKL selector</td>
             <td align="center"><a href="">Intel(R) oneAPI Math Kernel Library (oneMKL)</a></td>
             <td align="center">x86 CPU, Intel GPU</td>
@@ -28,10 +28,10 @@ oneMKL is part of the [UXL Foundation](
             <td align="center"><a href=""> NVIDIA cuBLAS</a></td>
             <td align="center">NVIDIA GPU</td>
-	<tr>
+        <tr>
             <td align="center"><a href=""> NVIDIA cuSOLVER</a></td>
             <td align="center">NVIDIA GPU</td>
-	</tr>
+        </tr>
             <td align="center"><a href=""> NVIDIA cuRAND</a></td>
             <td align="center">NVIDIA GPU</td>
@@ -40,6 +40,10 @@ oneMKL is part of the [UXL Foundation](
             <td align="center"><a href=""> NVIDIA cuFFT</a></td>
             <td align="center">NVIDIA GPU</td>
+        <tr>
+            <td align="center"><a href=""> NVIDIA cuSPARSE</a></td>
+            <td align="center">NVIDIA GPU</td>
+        </tr>
             <td align="center"><a href=""> NETLIB LAPACK</a> </td>
             <td align="center">x86 CPU</td>
@@ -329,7 +333,7 @@ Supported compilers include:
             <td align="center">Dynamic, Static</td>
-            <td rowspan=2 align="center">SPARSE_BLAS</td>
+            <td rowspan=3 align="center">SPARSE_BLAS</td>
             <td align="center">x86 CPU</td>
             <td align="center">Intel(R) oneMKL</td>
             <td align="center">Intel DPC++</td>
@@ -341,6 +345,12 @@ Supported compilers include:
             <td align="center">Intel DPC++</td>
             <td align="center">Dynamic, Static</td>
+        <tr>
+            <td align="center">NVIDIA GPU</td>
+            <td align="center">NVIDIA cuSPARSE</td>
+            <td align="center">Open DPC++</td>
+            <td align="center">Dynamic, Static</td>
+        </tr>
diff --git a/cmake/FindCompiler.cmake b/cmake/FindCompiler.cmake
index 556211999..8aefc2623 100644
--- a/cmake/FindCompiler.cmake
+++ b/cmake/FindCompiler.cmake
@@ -37,7 +37,7 @@ if(is_dpcpp)
     # Check if the Nvidia target is supported. PortFFT uses this for choosing default configuration.
     check_cxx_compiler_flag("-fsycl -fsycl-targets=nvptx64-nvidia-cuda" dpcpp_supports_nvptx64)
         -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda)
@@ -51,7 +51,7 @@ if(is_dpcpp)
         -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend 
       set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES
diff --git a/docs/building_the_project_with_dpcpp.rst b/docs/building_the_project_with_dpcpp.rst
index 6076117f7..efe92f285 100644
--- a/docs/building_the_project_with_dpcpp.rst
+++ b/docs/building_the_project_with_dpcpp.rst
@@ -104,6 +104,9 @@ The most important supported build options are:
      - True, False
      - False     
+     - True, False
+     - False     
      - True, False
      - False     
@@ -183,8 +186,8 @@ Building for CUDA
 The CUDA backends can be enabled with ``ENABLE_CUBLAS_BACKEND``,
 No additional parameters are required for using CUDA libraries. In most cases,
 the CUDA libraries should be found automatically by CMake.
@@ -371,6 +374,7 @@ disabled using the Ninja build system:
 ``$ONEMKL_DIR`` points at the oneMKL source directly. The x86 CPU (``MKLCPU``)
diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst
index eab5afd56..07d90359a 100644
--- a/docs/domains/sparse_linear_algebra.rst
+++ b/docs/domains/sparse_linear_algebra.rst
@@ -20,21 +20,150 @@ Currently known limitations:
 - ``oneapi::mkl::sparse::set_csr_data`` and
   ``oneapi::mkl::sparse::set_coo_data`` functions cannot be used on a handle
   that has already been used for an operation or its optimize function. Doing so
-  will throw an ``oneapi::mkl::unimplemented`` exception.
+  will throw a ``oneapi::mkl::unimplemented`` exception.
 - Using ``spsv`` with the ``oneapi::mkl::sparse::spsv_alg::no_optimize_alg`` and
   a sparse matrix that does not have the
-  ``oneapi::mkl::sparse::matrix_property::sorted`` property will throw an
+  ``oneapi::mkl::sparse::matrix_property::sorted`` property will throw a
   ``oneapi::mkl::unimplemented`` exception.
 - Using ``spmm`` on Intel GPU with a sparse matrix that is
   ``oneapi::mkl::transpose::conjtrans`` and has the
-  ``oneapi::mkl::sparse::matrix_property::symmetric`` property will throw an
+  ``oneapi::mkl::sparse::matrix_property::symmetric`` property will throw a
   ``oneapi::mkl::unimplemented`` exception.
 - Using ``spmv`` with a sparse matrix that is
   ``oneapi::mkl::transpose::conjtrans`` with a ``type_view``
-  ``matrix_descr::symmetric`` or ``matrix_descr::hermitian`` will throw an
+  ``matrix_descr::symmetric`` or ``matrix_descr::hermitian`` will throw a
   ``oneapi::mkl::unimplemented`` exception.
 - Using ``spsv`` on Intel GPU with a sparse matrix that is
-  ``oneapi::mkl::transpose::conjtrans`` and will throw an
+  ``oneapi::mkl::transpose::conjtrans`` will throw a
   ``oneapi::mkl::unimplemented`` exception.
 - Scalar parameters ``alpha`` and ``beta`` should be host pointers to prevent
   synchronizations and copies to the host.
+cuSPARSE backend
+Currently known limitations:
+- The COO format requires the indices to be sorted by row. See the `cuSPARSE
+  documentation
+  <>`_. Sparse
+  operations using matrices with the COO format without the property
+  ``matrix_property::sorted_by_rows`` or ``matrix_property::sorted`` will throw
+  a ``oneapi::mkl::unimplemented`` exception.
+- Using ``spmm`` with the algorithm ``spmm_alg::csr_alg3`` and an ``opA`` other
+  than ``transpose::nontrans`` or an ``opB`` ``transpose::conjtrans`` will throw
+  a ``oneapi::mkl::unimplemented`` exception.
+- Using ``spmm`` with the algorithm ``spmm_alg::csr_alg3``,
+  ``opB=transpose::trans`` and real fp64 precision will throw a
+  ``oneapi::mkl::unimplemented`` exception. This configuration can fail as of
+  CUDA 12.6.2, see the related issue
+  `here <>`_.
+- Using ``spmv`` with a ``type_view`` other than ``matrix_descr::general`` will
+  throw a ``oneapi::mkl::unimplemented`` exception.
+- Using ``spsv`` with the algorithm ``spsv_alg::no_optimize_alg`` may still
+  perform some mandatory preprocessing.
+- oneMKL Interface does not provide a way to use non-default algorithms without
+  calling preprocess functions such as ``cusparseSpMM_preprocess`` or
+  ``cusparseSpMV_preprocess``. Feel free to create an issue if this is needed.
+Operation algorithms mapping
+The following tables describe how a oneMKL SYCL Interface algorithm maps to the
+backend's algorithms. Refer to the backend's documentation for a more detailed
+explanation of the algorithms.
+Backends with no equivalent algorithms will fall back to the backend's default
+.. list-table::
+   :header-rows: 1
+   :widths: 10 30 45
+   * - ``spmm_alg`` value
+     - cuSPARSE
+   * - ``default_alg``
+     - none
+   * - ``no_optimize_alg``
+     - none
+   * - ``coo_alg1``
+     - none
+   * - ``coo_alg2``
+     - none
+   * - ``coo_alg3``
+     - none
+   * - ``coo_alg4``
+     - none
+   * - ``csr_alg1``
+     - none
+   * - ``csr_alg2``
+     - none
+   * - ``csr_alg3``
+     - none
+.. list-table::
+   :header-rows: 1
+   :widths: 10 30 45
+   * - ``spmv_alg`` value
+     - cuSPARSE
+   * - ``default_alg``
+     - none
+   * - ``no_optimize_alg``
+     - none
+   * - ``coo_alg1``
+     - none
+   * - ``coo_alg2``
+     - none
+   * - ``csr_alg1``
+     - none
+   * - ``csr_alg2``
+     - none
+   * - ``csr_alg3``
+     - none
+.. list-table::
+   :header-rows: 1
+   :widths: 10 30 45
+   * - ``spsv_alg`` value
+     - cuSPARSE
+   * - ``default_alg``
+     - none
+   * - ``no_optimize_alg``
+     - none
diff --git a/examples/ b/examples/
index 0dad8772d..45a100131 100644
--- a/examples/
+++ b/examples/
@@ -4,7 +4,7 @@ oneAPI Math Kernel Library (oneMKL) Interfaces offers examples with the followin
 - rng: uniform_usm  
 - lapack: getrs_usm
 - dft: complex_fwd_usm, real_fwd_usm
-- sparse_blas: sparse_gemv_usm
+- sparse_blas: sparse_spmv_usm
 Each routine has one run-time dispatching example and one compile-time dispatching example (which uses both mklcpu and cuda backends), located in `example/<$domain>/run_time_dispatching` and `example/<$domain>/compile_time_dispatching` subfolders, respectively.
@@ -487,111 +487,119 @@ Unsupported Configuration:
 Run-time dispatching examples with mklcpu backend
 $ export ONEAPI_DEVICE_SELECTOR="opencl:cpu"
-$ ./bin/example_sparse_blas_gemv_usm
+$ ./bin/example_sparse_blas_spmv_usm
-# Sparse Matrix-Vector Multiply Example: 
+# Sparse Matrix-Vector Multiply Example:
 # y = alpha * op(A) * x + beta * y
 # where A is a sparse matrix in CSR format, x and y are dense vectors
 # and alpha, beta are floating point type precision scalars.
 # Using apis:
-#   sparse::gemv
+#   sparse::spmv
 # Using single precision (float) data type
 # Device will be selected during runtime.
 # The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
 # available devices
-Running Sparse BLAS GEMV USM example on CPU device.
-Device name is: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz
+Running Sparse BLAS SPMV USM example on CPU device.
+Device name is: Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz
 Running with single precision real data type:
-		sparse::gemv parameters:
-			transA = nontrans
-			nrows = 64
-			alpha = 1, beta = 0
+                sparse::spmv parameters:
+                        transA = nontrans
+                        nrows = 64
+                        alpha = 1, beta = 0
-		 sparse::gemv example passed
-	Finished
-Sparse BLAS GEMV USM example ran OK.
+                 sparse::spmv example passed
+        Finished
+Sparse BLAS SPMV USM example ran OK.
 Run-time dispatching examples with mklgpu backend
 $ export ONEAPI_DEVICE_SELECTOR="level_zero:gpu"
-$ ./bin/example_sparse_blas_gemv_usm
+$ ./bin/example_sparse_blas_spmv_usm
-# Sparse Matrix-Vector Multiply Example: 
+# Sparse Matrix-Vector Multiply Example:
 # y = alpha * op(A) * x + beta * y
 # where A is a sparse matrix in CSR format, x and y are dense vectors
 # and alpha, beta are floating point type precision scalars.
 # Using apis:
-#   sparse::gemv
+#   sparse::spmv
 # Using single precision (float) data type
 # Device will be selected during runtime.
 # The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
 # available devices
-Running Sparse BLAS GEMV USM example on GPU device.
+Running Sparse BLAS SPMV USM example on GPU device.
 Device name is: Intel(R) HD Graphics 530 [0x1912]
 Running with single precision real data type:
-		sparse::gemv parameters:
-			transA = nontrans
-			nrows = 64
-			alpha = 1, beta = 0
+                sparse::spmv parameters:
+                        transA = nontrans
+                        nrows = 64
+                        alpha = 1, beta = 0
-		 sparse::gemv example passed
-	Finished
-Sparse BLAS GEMV USM example ran OK.
+                 sparse::spmv example passed
+        Finished
+Sparse BLAS SPMV USM example ran OK.
-Compile-time dispatching example with mklcpu backend
+Compile-time dispatching example with both mklcpu and cusparse backends
-$ export ONEAPI_DEVICE_SELECTOR="opencl:cpu"
-$ ./bin/example_sparse_blas_gemv_usm_mklcpu
+$ ./bin/sparse_blas_spmv_usm_mklcpu_cusparse
-# Sparse Matrix-Vector Multiply Example: 
+# Sparse Matrix-Vector Multiply Example:
 # y = alpha * op(A) * x + beta * y
-# where A is a sparse matrix in CSR format, x and y are dense vectors
+# where A is a sparse matrix in COO format, x and y are dense vectors
 # and alpha, beta are floating point type precision scalars.
 # Using apis:
-#   sparse::gemv
+#   sparse::spmv
 # Using single precision (float) data type
-# Running on Intel CPU device
+# Running on both Intel CPU and Nvidia GPU devices
-Running Sparse BLAS GEMV USM example on CPU device.
-Device name is: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz
+Running Sparse BLAS SPMV USM example on:
+        CPU device: Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz
+        GPU device: NVIDIA A100-PCIE-40GB
 Running with single precision real data type:
-		sparse::gemv parameters:
-			transA = nontrans
-			nrows = 64
-			alpha = 1, beta = 0
+                sparse::spmv parameters:
+                        transA = nontrans
+                        size = 8
+                        alpha = 1, beta = 0
+                 sparse::spmv example passed
+        Finished
+                sparse::spmv parameters:
+                        transA = nontrans
+                        size = 8
+                        alpha = 1, beta = 0
-		 sparse::gemv example passed
-	Finished
-Sparse BLAS GEMV USM example ran OK.
+                 sparse::spmv example passed
+        Finished
+Sparse BLAS SPMV USM example ran OK on MKLCPU and CUSPARSE.
diff --git a/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt b/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
index 5dbbba8a4..a38f4ebd4 100644
--- a/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
+++ b/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
@@ -18,27 +18,24 @@
 #Build object from all sources
+  list(APPEND SPARSE_CT_SOURCES "sparse_blas_spmv_usm_mklcpu_cusparse")
-foreach(backend ${SPARSE_BLAS_BACKENDS})
-  set(EXAMPLE_NAME example_sparse_blas_spmv_usm_${backend})
-  add_executable(${EXAMPLE_NAME} sparse_blas_spmv_usm_${backend}.cpp)
-  target_include_directories(${EXAMPLE_NAME}
+foreach(sparse_ct_source ${SPARSE_CT_SOURCES})
+  add_executable(${sparse_ct_source} ${sparse_ct_source}.cpp)
+  target_include_directories(${sparse_ct_source}
       PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
-  add_dependencies(${EXAMPLE_NAME} onemkl_sparse_blas_${backend})
-  target_link_libraries(${EXAMPLE_NAME} PRIVATE ONEMKL::SYCL::SYCL onemkl_sparse_blas_${backend})
+  target_link_libraries(${sparse_ct_source} PRIVATE ONEMKL::SYCL::SYCL onemkl_sparse_blas_mklcpu onemkl_sparse_blas_cusparse)
   # Register example as ctest
-  add_test(NAME sparse_blas/EXAMPLE/CT/sparse_blas_spmv_usm_${backend} COMMAND ${EXAMPLE_NAME})
+  add_test(NAME sparse_blas/EXAMPLE/CT/${sparse_ct_source} COMMAND ${sparse_ct_source})
diff --git a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu.cpp b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp
similarity index 55%
rename from examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu.cpp
rename to examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp
index 964afb49b..31ce1975c 100644
--- a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu.cpp
+++ b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp
@@ -22,7 +22,7 @@
 *  Content:
 *       This example demonstrates use of DPCPP API oneapi::mkl::sparse::spmv
 *       using unified shared memory to perform general sparse matrix-vector
-*       multiplication on a INTEL CPU SYCL device.
+*       multiplication on an INTEL CPU SYCL device and an NVIDIA GPU SYCL device.
 *       y = alpha * op(A) * x + beta * y
@@ -59,69 +59,54 @@
 // is performed and finally the results are post processed.
-template <typename fp, typename intType>
-int run_sparse_matrix_vector_multiply_example(const sycl::device& cpu_dev) {
+template <typename fpType, typename intType, typename selectorType>
+int run_sparse_matrix_vector_multiply_example(selectorType& selector) {
+    auto queue = selector.get_queue();
     // Matrix data size
-    intType size = 4;
-    intType nrows = size * size * size;
+    static constexpr intType size = 8;
-    // Set scalar fp values
-    fp alpha = set_fp_value(fp(1.0));
-    fp beta = set_fp_value(fp(0.0));
+    // Set scalar fpType values
+    fpType alpha = set_fp_value(fpType(1.0));
+    fpType beta = set_fp_value(fpType(0.0));
-    // Catch asynchronous exceptions
-    auto exception_handler = [](sycl::exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const& e) {
-                std::cout << "Caught asynchronous SYCL "
-                             "exception during sparse::spmv:\n"
-                          << e.what() << std::endl;
-            }
-        }
-    };
+    intType nnz = 9;
+    // host_ia must be sorted to maintain the sorted_by_rows property
+    intType host_ia[] = { 0, 0, 1, 3, 4, 4, 4, 7, 7 };
+    intType host_ja[] = { 0, 7, 2, 2, 5, 4, 0, 0, 7 };
+    intType* ia = (intType*)sycl::malloc_shared(nnz * sizeof(intType), queue);
+    intType* ja = (intType*)sycl::malloc_shared(nnz * sizeof(intType), queue);
+    fpType* a = (fpType*)sycl::malloc_shared(nnz * sizeof(fpType), queue);
+    fpType* x = (fpType*)sycl::malloc_shared(size * sizeof(fpType), queue);
+    fpType* y = (fpType*)sycl::malloc_shared(size * sizeof(fpType), queue);
-    // create execution queue and buffers of matrix data
-    sycl::queue cpu_queue(cpu_dev, exception_handler);
-    oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu> cpu_selector{ cpu_queue };
-    intType *ia, *ja;
-    fp *a, *x, *y, *z;
-    std::size_t sizea = static_cast<std::size_t>(27 * nrows);
-    std::size_t sizeja = static_cast<std::size_t>(27 * nrows);
-    std::size_t sizeia = static_cast<std::size_t>(nrows + 1);
-    std::size_t sizevec = static_cast<std::size_t>(nrows);
-    ia = (intType*)sycl::malloc_shared(sizeia * sizeof(intType), cpu_queue);
-    ja = (intType*)sycl::malloc_shared(sizeja * sizeof(intType), cpu_queue);
-    a = (fp*)sycl::malloc_shared(sizea * sizeof(fp), cpu_queue);
-    x = (fp*)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue);
-    y = (fp*)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue);
-    z = (fp*)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue);
-    if (!ia || !ja || !a || !x || !y || !z) {
+    if (!ia || !ja || !a || !x || !y) {
         throw std::runtime_error("Failed to allocate USM memory");
-    intType nnz = generate_sparse_matrix<fp, intType>(size, ia, ja, a);
+    // Copy ia and ja
+    queue.memcpy(ia, host_ia, nnz * sizeof(intType)).wait_and_throw();
+    queue.memcpy(ja, host_ja, nnz * sizeof(intType)).wait_and_throw();
+    // Init matrix values
+    for (int i = 0; i < nnz; i++) {
+        a[i] = set_fp_value(fpType(i + 1));
+    }
     // Init vectors x and y
-    for (int i = 0; i < nrows; i++) {
-        x[i] = set_fp_value(fp(1.0));
-        y[i] = set_fp_value(fp(0.0));
-        z[i] = set_fp_value(fp(0.0));
+    for (int i = 0; i < size; i++) {
+        x[i] = set_fp_value(fpType(i + 1));
+        y[i] = set_fp_value(fpType(0.0));
     std::vector<intType*> int_ptr_vec;
-    std::vector<fp*> fp_ptr_vec;
+    std::vector<fpType*> fp_ptr_vec;
-    fp_ptr_vec.push_back(z);
     // Execute Matrix Multiply
@@ -137,49 +122,52 @@ int run_sparse_matrix_vector_multiply_example(const sycl::device& cpu_dev) {
                       ? "nontrans"
                       : (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
               << std::endl;
-    std::cout << "\t\t\tnrows = " << nrows << std::endl;
+    std::cout << "\t\t\tsize = " << size << std::endl;
     std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl;
-    // Create and initialize handle for a Sparse Matrix in CSR format
+    // Create and initialize handle for a Sparse Matrix in COO format sorted by rows
     oneapi::mkl::sparse::matrix_handle_t A_handle = nullptr;
-    oneapi::mkl::sparse::init_csr_matrix(cpu_selector, &A_handle, nrows, nrows, nnz,
+    oneapi::mkl::sparse::init_coo_matrix(selector, &A_handle, size, size, nnz,
                                          oneapi::mkl::index_base::zero, ia, ja, a);
+    // cuSPARSE backend requires that the property sorted_by_rows or sorted is set when using matrices in COO format.
+    // Setting these properties is also the best practice to get best performance.
+    oneapi::mkl::sparse::set_matrix_property(selector, A_handle,
+                                             oneapi::mkl::sparse::matrix_property::sorted_by_rows);
     // Create and initialize dense vector handles
     oneapi::mkl::sparse::dense_vector_handle_t x_handle = nullptr;
     oneapi::mkl::sparse::dense_vector_handle_t y_handle = nullptr;
-    oneapi::mkl::sparse::init_dense_vector(cpu_selector, &x_handle, sizevec, x);
-    oneapi::mkl::sparse::init_dense_vector(cpu_selector, &y_handle, sizevec, y);
+    oneapi::mkl::sparse::init_dense_vector(selector, &x_handle, size, x);
+    oneapi::mkl::sparse::init_dense_vector(selector, &y_handle, size, y);
     // Create operation descriptor
     oneapi::mkl::sparse::spmv_descr_t descr = nullptr;
-    oneapi::mkl::sparse::init_spmv_descr(cpu_selector, &descr);
+    oneapi::mkl::sparse::init_spmv_descr(selector, &descr);
     // Allocate external workspace
     std::size_t workspace_size = 0;
-    oneapi::mkl::sparse::spmv_buffer_size(cpu_selector, transA, &alpha, A_view, A_handle, x_handle,
+    oneapi::mkl::sparse::spmv_buffer_size(selector, transA, &alpha, A_view, A_handle, x_handle,
                                           &beta, y_handle, alg, descr, workspace_size);
-    void* workspace = sycl::malloc_device(workspace_size, cpu_queue);
+    void* workspace = sycl::malloc_device(workspace_size, queue);
     // Optimize spmv
     auto ev_opt =
-        oneapi::mkl::sparse::spmv_optimize(cpu_selector, transA, &alpha, A_view, A_handle, x_handle,
+        oneapi::mkl::sparse::spmv_optimize(selector, transA, &alpha, A_view, A_handle, x_handle,
                                            &beta, y_handle, alg, descr, workspace);
     // Run spmv
-    auto ev_spmv = oneapi::mkl::sparse::spmv(cpu_selector, transA, &alpha, A_view, A_handle,
-                                             x_handle, &beta, y_handle, alg, descr, { ev_opt });
+    auto ev_spmv = oneapi::mkl::sparse::spmv(selector, transA, &alpha, A_view, A_handle, x_handle,
+                                             &beta, y_handle, alg, descr, { ev_opt });
     // Release handles and descriptor
     std::vector<sycl::event> release_events;
-        oneapi::mkl::sparse::release_dense_vector(cpu_selector, x_handle, { ev_spmv }));
+        oneapi::mkl::sparse::release_dense_vector(selector, x_handle, { ev_spmv }));
-        oneapi::mkl::sparse::release_dense_vector(cpu_selector, y_handle, { ev_spmv }));
+        oneapi::mkl::sparse::release_dense_vector(selector, y_handle, { ev_spmv }));
-        oneapi::mkl::sparse::release_sparse_matrix(cpu_selector, A_handle, { ev_spmv }));
-    release_events.push_back(
-        oneapi::mkl::sparse::release_spmv_descr(cpu_selector, descr, { ev_spmv }));
+        oneapi::mkl::sparse::release_sparse_matrix(selector, A_handle, { ev_spmv }));
+    release_events.push_back(oneapi::mkl::sparse::release_spmv_descr(selector, descr, { ev_spmv }));
     for (auto event : release_events) {
@@ -188,33 +176,26 @@ int run_sparse_matrix_vector_multiply_example(const sycl::device& cpu_dev) {
     // Post Processing
-    fp* res = y;
-    const bool isConj = (transA == oneapi::mkl::transpose::conjtrans);
-    for (intType row = 0; row < nrows; row++) {
-        z[row] *= beta;
-    }
-    for (intType row = 0; row < nrows; row++) {
-        fp tmp = alpha * x[row];
-        for (intType i = ia[row]; i < ia[row + 1]; i++) {
-            if constexpr (is_complex<fp>()) {
-                z[ja[i]] += tmp * (isConj ? std::conj(a[i]) : a[i]);
-            }
-            else {
-                z[ja[i]] += tmp * a[i];
-            }
-        }
+    // The example assumes matrices are not transposed and beta=0 for simplicity.
+    // See the tests for more in-depth verification.
+    fpType* res = y;
+    fpType expected_res[size] = {};
+    for (intType i = 0; i < nnz; ++i) {
+        intType row = ia[i];
+        intType col = ja[i];
+        expected_res[row] += alpha * x[col] * a[i];
     bool good = true;
-    for (intType row = 0; row < nrows; row++) {
-        good &= check_result(res[row], z[row], nrows, row);
+    for (intType row = 0; row < size; row++) {
+        good &= check_result(res[row], expected_res[row], size, row);
     std::cout << "\n\t\t sparse::spmv example " << (good ? "passed" : "failed") << "\n\tFinished"
               << std::endl;
-    free_vec(fp_ptr_vec, cpu_queue);
-    free_vec(int_ptr_vec, cpu_queue);
+    free_vec(fp_ptr_vec, queue);
+    free_vec(int_ptr_vec, queue);
     if (!good)
         return 1;
@@ -234,7 +215,7 @@ void print_example_banner() {
     std::cout << "# " << std::endl;
     std::cout << "# y = alpha * op(A) * x + beta * y" << std::endl;
     std::cout << "# " << std::endl;
-    std::cout << "# where A is a sparse matrix in CSR format, x and y are "
+    std::cout << "# where A is a sparse matrix in COO format, x and y are "
                  "dense vectors"
               << std::endl;
     std::cout << "# and alpha, beta are floating point type precision scalars." << std::endl;
@@ -244,7 +225,7 @@ void print_example_banner() {
     std::cout << "# " << std::endl;
     std::cout << "# Using single precision (float) data type" << std::endl;
     std::cout << "# " << std::endl;
-    std::cout << "# Running on Intel CPU device" << std::endl;
+    std::cout << "# Running on both Intel CPU and Nvidia GPU devices" << std::endl;
     std::cout << "# " << std::endl;
     std::cout << "########################################################################"
               << std::endl;
@@ -257,17 +238,44 @@ void print_example_banner() {
 int main(int /*argc*/, char** /*argv*/) {
+    auto exception_handler = [](sycl::exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (sycl::exception const& e) {
+                std::cout << "Caught asynchronous SYCL "
+                             "exception during sparse::spmv:\n"
+                          << e.what() << std::endl;
+            }
+        }
+    };
     try {
-        // TODO: Add cuSPARSE compile-time dispatcher in this example once it is supported.
-        sycl::device cpu_dev(sycl::cpu_selector_v);
+        sycl::queue cpu_queue(sycl::cpu_selector_v, exception_handler);
+        sycl::queue gpu_queue(sycl::gpu_selector_v, exception_handler);
+        unsigned int vendor_id = gpu_queue.get_device().get_info<sycl::info::device::vendor_id>();
+        if (vendor_id != NVIDIA_ID) {
+            std::cerr << "FAILED: NVIDIA GPU device not found" << std::endl;
+            return 1;
+        }
+        oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu> cpu_selector{ cpu_queue };
+        oneapi::mkl::backend_selector<oneapi::mkl::backend::cusparse> gpu_selector{ gpu_queue };
-        std::cout << "Running Sparse BLAS SPMV USM example on CPU device." << std::endl;
-        std::cout << "Device name is: " << cpu_dev.get_info<sycl::info::device::name>()
+        std::cout << "Running Sparse BLAS SPMV USM example on:" << std::endl;
+        std::cout << "\tCPU device: " << cpu_queue.get_device().get_info<sycl::info::device::name>()
+                  << std::endl;
+        std::cout << "\tGPU device: " << gpu_queue.get_device().get_info<sycl::info::device::name>()
                   << std::endl;
         std::cout << "Running with single precision real data type:" << std::endl;
-        run_sparse_matrix_vector_multiply_example<float, std::int32_t>(cpu_dev);
-        std::cout << "Sparse BLAS SPMV USM example ran OK." << std::endl;
+        int err = run_sparse_matrix_vector_multiply_example<float, std::int32_t>(cpu_selector);
+        if (err)
+            return err;
+        err = run_sparse_matrix_vector_multiply_example<float, std::int32_t>(gpu_selector);
+        if (err)
+            return err;
+        std::cout << "Sparse BLAS SPMV USM example ran OK on MKLCPU and CUSPARSE." << std::endl;
     catch (sycl::exception const& e) {
         std::cerr << "Caught synchronous SYCL exception during Sparse SPMV:" << std::endl;
diff --git a/examples/sparse_blas/run_time_dispatching/CMakeLists.txt b/examples/sparse_blas/run_time_dispatching/CMakeLists.txt
index 398f3e0f2..f09daf819 100644
--- a/examples/sparse_blas/run_time_dispatching/CMakeLists.txt
+++ b/examples/sparse_blas/run_time_dispatching/CMakeLists.txt
@@ -33,6 +33,9 @@ endif()
   list(APPEND DEVICE_FILTERS "level_zero:gpu")
+  list(APPEND DEVICE_FILTERS "cuda:gpu")
 message(STATUS "ONEAPI_DEVICE_SELECTOR will be set to the following value(s): [${DEVICE_FILTERS}] for run-time dispatching examples")
diff --git a/include/oneapi/mkl/detail/backends.hpp b/include/oneapi/mkl/detail/backends.hpp
index 32b7c2614..216a6feba 100644
--- a/include/oneapi/mkl/detail/backends.hpp
+++ b/include/oneapi/mkl/detail/backends.hpp
@@ -40,20 +40,31 @@ enum class backend {
+    cusparse,
 typedef std::map<backend, std::string> backendmap;
-static backendmap backend_map = {
-    { backend::mklcpu, "mklcpu" },       { backend::mklgpu, "mklgpu" },
-    { backend::cublas, "cublas" },       { backend::cusolver, "cusolver" },
-    { backend::curand, "curand" },       { backend::netlib, "netlib" },
-    { backend::rocblas, "rocblas" },     { backend::rocrand, "rocrand" },
-    { backend::rocsolver, "rocsolver" }, { backend::portblas, "portblas" },
-    { backend::cufft, "cufft" },         { backend::rocfft, "rocfft" },
-    { backend::portfft, "portfft" },     { backend::unsupported, "unsupported" }
+// clang-format alternates the formatting depending on the parity of the number of backends.
+// It is disabled here to reduce noise.
+// clang-format off
+static backendmap backend_map = { { backend::mklcpu, "mklcpu" },
+                                  { backend::mklgpu, "mklgpu" },
+                                  { backend::cublas, "cublas" },
+                                  { backend::cusolver, "cusolver" },
+                                  { backend::curand, "curand" },
+                                  { backend::netlib, "netlib" },
+                                  { backend::rocblas, "rocblas" },
+                                  { backend::rocrand, "rocrand" },
+                                  { backend::rocsolver, "rocsolver" },
+                                  { backend::portblas, "portblas" },
+                                  { backend::cufft, "cufft" },
+                                  { backend::rocfft, "rocfft" },
+                                  { backend::portfft, "portfft" },
+                                  { backend::cusparse, "cusparse" },
+                                  { backend::unsupported, "unsupported" } };
+// clang-format on
 } //namespace mkl
 } //namespace oneapi
diff --git a/include/oneapi/mkl/detail/backends_table.hpp b/include/oneapi/mkl/detail/backends_table.hpp
index 731781375..9b7c921d6 100644
--- a/include/oneapi/mkl/detail/backends_table.hpp
+++ b/include/oneapi/mkl/detail/backends_table.hpp
@@ -198,6 +198,12 @@ static std::map<domain, std::map<device, std::vector<const char*>>> libraries =
+          } },
+        { device::nvidiagpu,
+          {
+              LIB_NAME("sparse_blas_cusparse")
           } } } },
diff --git a/include/oneapi/mkl/sparse_blas.hpp b/include/oneapi/mkl/sparse_blas.hpp
index 004b79727..8fb86f244 100644
--- a/include/oneapi/mkl/sparse_blas.hpp
+++ b/include/oneapi/mkl/sparse_blas.hpp
@@ -34,6 +34,9 @@
 #include "sparse_blas/detail/mklgpu/sparse_blas_ct.hpp"
+#include "sparse_blas/detail/cusparse/sparse_blas_ct.hpp"
 #include "sparse_blas/detail/sparse_blas_rt.hpp"
diff --git a/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp b/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp
new file mode 100644
index 000000000..c8e816eeb
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp
@@ -0,0 +1,33 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+#include "oneapi/mkl/detail/export.hpp"
+#include "oneapi/mkl/sparse_blas/detail/helper_types.hpp"
+#include "oneapi/mkl/sparse_blas/types.hpp"
+namespace oneapi::mkl::sparse::cusparse {
+#include "oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx"
+} // namespace oneapi::mkl::sparse::cusparse
diff --git a/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp b/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp
new file mode 100644
index 000000000..11abb9a6f
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp
@@ -0,0 +1,40 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+#include "oneapi/mkl/detail/backends.hpp"
+#include "oneapi/mkl/detail/backend_selector.hpp"
+#include "onemkl_sparse_blas_cusparse.hpp"
+namespace oneapi {
+namespace mkl {
+namespace sparse {
+#define BACKEND cusparse
+#include "oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx"
+#undef BACKEND
+} //namespace sparse
+} //namespace mkl
+} //namespace oneapi
diff --git a/include/oneapi/mkl/sparse_blas/types.hpp b/include/oneapi/mkl/sparse_blas/types.hpp
index d619be4b3..1a50d6ef4 100644
--- a/include/oneapi/mkl/sparse_blas/types.hpp
+++ b/include/oneapi/mkl/sparse_blas/types.hpp
@@ -36,6 +36,7 @@ namespace sparse {
 enum class matrix_property {
+    sorted_by_rows,
 enum class spmm_alg {
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6ff8d5d11..c363d8a8d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -59,6 +59,7 @@ function(generate_header_file)
   configure_file( "${CMAKE_CURRENT_BINARY_DIR}/oneapi/mkl/config.hpp.configured")
diff --git a/src/ b/src/
index de44cb16b..5d8b9a136 100644
--- a/src/
+++ b/src/
@@ -24,6 +24,7 @@
diff --git a/src/sparse_blas/backends/CMakeLists.txt b/src/sparse_blas/backends/CMakeLists.txt
index 294040808..baae9445d 100644
--- a/src/sparse_blas/backends/CMakeLists.txt
+++ b/src/sparse_blas/backends/CMakeLists.txt
@@ -27,3 +27,7 @@ endif()
+  add_subdirectory(cusparse)
diff --git a/src/sparse_blas/backends/cusparse/CMakeLists.txt b/src/sparse_blas/backends/cusparse/CMakeLists.txt
new file mode 100644
index 000000000..60bbaf35f
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/CMakeLists.txt
@@ -0,0 +1,85 @@
+# Copyright 2024 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# See the License for the specific language governing permissions
+# and limitations under the License.
+# SPDX-License-Identifier: Apache-2.0
+set(LIB_NAME onemkl_sparse_blas_cusparse)
+set(LIB_OBJ ${LIB_NAME}_obj)
+add_library(${LIB_OBJ} OBJECT
+  cusparse_handles.cpp
+  cusparse_scope_handle.cpp
+  operations/cusparse_spmm.cpp
+  operations/cusparse_spmv.cpp
+  operations/cusparse_spsv.cpp
+  $<$<BOOL:${BUILD_SHARED_LIBS}>: cusparse_wrappers.cpp>
+add_dependencies(onemkl_backend_libs_sparse_blas ${LIB_NAME})
+          ${PROJECT_SOURCE_DIR}/src
+          ${CMAKE_BINARY_DIR}/bin
+target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
+  find_package(CUDA 12.2 REQUIRED)
+  target_include_directories(${LIB_OBJ} PRIVATE ${CUDA_INCLUDE_DIRS})
+  target_link_libraries(${LIB_OBJ} PUBLIC cuda rt ${CUDA_cusparse_LIBRARY})
+  find_package(CUDAToolkit 12.2 REQUIRED)
+  target_link_libraries(${LIB_OBJ} PRIVATE CUDA::cusparse CUDA::cudart CUDA::cuda_driver)
+  PRIVATE onemkl_warnings
+set_target_properties(${LIB_OBJ} PROPERTIES
+target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
+#Set oneMKL libraries as not transitive for dynamic
+  set_target_properties(${LIB_NAME} PROPERTIES
+  )
+# Add major version to the library
+set_target_properties(${LIB_NAME} PROPERTIES
+# Add dependencies rpath to the library
+# Add the library to install package
+install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
+install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
diff --git a/src/sparse_blas/backends/cusparse/cusparse_error.hpp b/src/sparse_blas/backends/cusparse/cusparse_error.hpp
new file mode 100644
index 000000000..738888576
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_error.hpp
@@ -0,0 +1,103 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+#include <string>
+#include <cuda.h>
+#include <cusparse.h>
+#include "oneapi/mkl/exceptions.hpp"
+namespace oneapi::mkl::sparse::cusparse::detail {
+inline std::string cuda_result_to_str(CUresult result) {
+    switch (result) {
+    case STATUS: return #STATUS
+        default: return "<unknown>";
+    }
+#define CUDA_ERROR_FUNC(func, ...)                                                          \
+    do {                                                                                    \
+        auto res = func(__VA_ARGS__);                                                       \
+        if (res != CUDA_SUCCESS) {                                                          \
+            throw oneapi::mkl::exception("sparse_blas", #func,                              \
+                                         "cuda error: " + detail::cuda_result_to_str(res)); \
+        }                                                                                   \
+    } while (0)
+inline std::string cusparse_status_to_str(cusparseStatus_t status) {
+    switch (status) {
+    case STATUS: return #STATUS
+        default: return "<unknown>";
+    }
+inline void check_status(cusparseStatus_t status, const std::string& function,
+                         std::string error_str = "") {
+    if (status != CUSPARSE_STATUS_SUCCESS) {
+        if (!error_str.empty()) {
+            error_str += "; ";
+        }
+        error_str += "cuSPARSE status: " + cusparse_status_to_str(status);
+        switch (status) {
+                throw oneapi::mkl::unimplemented("sparse_blas", function, error_str);
+                throw oneapi::mkl::uninitialized("sparse_blas", function, error_str);
+                throw oneapi::mkl::invalid_argument("sparse_blas", function, error_str);
+            default: throw oneapi::mkl::exception("sparse_blas", function, error_str);
+        }
+    }
+#define CUSPARSE_ERR_FUNC(func, ...)         \
+    do {                                     \
+        auto status = func(__VA_ARGS__);     \
+        detail::check_status(status, #func); \
+    } while (0)
+} // namespace oneapi::mkl::sparse::cusparse::detail
diff --git a/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp
new file mode 100644
index 000000000..179b007f5
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp
@@ -0,0 +1,63 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+ * @file Similar to blas_handle.hpp
+ * Provides a map from a ur_context_handle_t (or equivalent) to a cusparseHandle_t.
+ * @see cusparse_scope_handle.hpp
+#include <atomic>
+#include <unordered_map>
+namespace oneapi::mkl::sparse::cusparse::detail {
+template <typename T>
+struct cusparse_global_handle {
+    using handle_container_t = std::unordered_map<T, std::atomic<cusparseHandle_t>*>;
+    handle_container_t cusparse_global_handle_mapper_{};
+    ~cusparse_global_handle() noexcept(false) {
+        for (auto& handle_pair : cusparse_global_handle_mapper_) {
+            if (handle_pair.second != nullptr) {
+                auto handle = handle_pair.second->exchange(nullptr);
+                if (handle != nullptr) {
+                    CUSPARSE_ERR_FUNC(cusparseDestroy, handle);
+                    handle = nullptr;
+                }
+                else {
+                    // if the handle is nullptr it means the handle was already
+                    // destroyed by the ContextCallback and we're free to delete the
+                    // atomic object.
+                    delete handle_pair.second;
+                }
+                handle_pair.second = nullptr;
+            }
+        }
+        cusparse_global_handle_mapper_.clear();
+    }
+} // namespace oneapi::mkl::sparse::cusparse::detail
diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp
new file mode 100644
index 000000000..ff3d8fcae
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp
@@ -0,0 +1,485 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+#include "cusparse_error.hpp"
+#include "cusparse_helper.hpp"
+#include "cusparse_handles.hpp"
+#include "cusparse_task.hpp"
+#include "sparse_blas/macros.hpp"
+namespace oneapi::mkl::sparse::cusparse {
+ * In this file, CusparseScopedContextHandler is used to ensure that a cusparseHandle_t is created before any other cuSPARSE call, as required by the specification.
+// Dense vector
+template <typename fpType>
+void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, std::int64_t size,
+                       sycl::buffer<fpType, 1> val) {
+    auto event = queue.submit([&](sycl::handler& cgh) {
+        auto acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+            auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+            cusparseDnVecDescr_t cu_dvhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, detail::get_mem(ih, acc),
+                              cuda_value_type);
+            *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size);
+        });
+    });
+    event.wait_and_throw();
+template <typename fpType>
+void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, std::int64_t size,
+                       fpType* val) {
+    auto event = queue.submit([&](sycl::handler& cgh) {
+        detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+            auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+            cusparseDnVecDescr_t cu_dvhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, val, cuda_value_type);
+            *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size);
+        });
+    });
+    event.wait_and_throw();
+template <typename fpType>
+void set_dense_vector_data(sycl::queue& queue, dense_vector_handle_t dvhandle, std::int64_t size,
+                           sycl::buffer<fpType, 1> val) {
+    detail::check_can_reset_value_handle<fpType>(__func__, dvhandle, true);
+    auto event = queue.submit([&](sycl::handler& cgh) {
+        auto acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+            if (dvhandle->size != size) {
+                CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle);
+                auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+                CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size,
+                                  detail::get_mem(ih, acc), cuda_value_type);
+                dvhandle->size = size;
+            }
+            else {
+                CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle,
+                                  detail::get_mem(ih, acc));
+            }
+            dvhandle->set_buffer(val);
+        });
+    });
+    event.wait_and_throw();
+template <typename fpType>
+void set_dense_vector_data(sycl::queue&, dense_vector_handle_t dvhandle, std::int64_t size,
+                           fpType* val) {
+    detail::check_can_reset_value_handle<fpType>(__func__, dvhandle, false);
+    if (dvhandle->size != size) {
+        CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle);
+        auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+        CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size, val,
+                          cuda_value_type);
+        dvhandle->size = size;
+    }
+    else {
+        CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle, val);
+    }
+    dvhandle->set_usm_ptr(val);
+sycl::event release_dense_vector(sycl::queue& queue, dense_vector_handle_t dvhandle,
+                                 const std::vector<sycl::event>& dependencies) {
+    // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used
+    auto functor = [=](sycl::interop_handle) {
+        CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle);
+        delete dvhandle;
+    };
+    return detail::dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dvhandle);
+// Dense matrix
+template <typename fpType>
+void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows,
+                       std::int64_t num_cols, std::int64_t ld, layout dense_layout,
+                       sycl::buffer<fpType, 1> val) {
+    auto event = queue.submit([&](sycl::handler& cgh) {
+        auto acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+            auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+            auto cuda_order = detail::get_cuda_order(dense_layout);
+            cusparseDnMatDescr_t cu_dmhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld,
+                              detail::get_mem(ih, acc), cuda_value_type, cuda_order);
+            *p_dmhandle =
+                new dense_matrix_handle(cu_dmhandle, val, num_rows, num_cols, ld, dense_layout);
+        });
+    });
+    event.wait_and_throw();
+template <typename fpType>
+void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows,
+                       std::int64_t num_cols, std::int64_t ld, layout dense_layout, fpType* val) {
+    auto event = queue.submit([&](sycl::handler& cgh) {
+        detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+            auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+            auto cuda_order = detail::get_cuda_order(dense_layout);
+            cusparseDnMatDescr_t cu_dmhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld, val,
+                              cuda_value_type, cuda_order);
+            *p_dmhandle =
+                new dense_matrix_handle(cu_dmhandle, val, num_rows, num_cols, ld, dense_layout);
+        });
+    });
+    event.wait_and_throw();
+template <typename fpType>
+void set_dense_matrix_data(sycl::queue& queue, dense_matrix_handle_t dmhandle,
+                           std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
+                           oneapi::mkl::layout dense_layout, sycl::buffer<fpType, 1> val) {
+    detail::check_can_reset_value_handle<fpType>(__func__, dmhandle, true);
+    auto event = queue.submit([&](sycl::handler& cgh) {
+        auto acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+            if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols ||
+                dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) {
+                CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle);
+                auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+                auto cuda_order = detail::get_cuda_order(dense_layout);
+                CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows,
+                                  num_cols, ld, detail::get_mem(ih, acc), cuda_value_type,
+                                  cuda_order);
+                dmhandle->num_rows = num_rows;
+                dmhandle->num_cols = num_cols;
+                dmhandle->ld = ld;
+                dmhandle->dense_layout = dense_layout;
+            }
+            else {
+                CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle,
+                                  detail::get_mem(ih, acc));
+            }
+            dmhandle->set_buffer(val);
+        });
+    });
+    event.wait_and_throw();
+template <typename fpType>
+void set_dense_matrix_data(sycl::queue&, dense_matrix_handle_t dmhandle, std::int64_t num_rows,
+                           std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout,
+                           fpType* val) {
+    detail::check_can_reset_value_handle<fpType>(__func__, dmhandle, false);
+    if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || dmhandle->ld != ld ||
+        dmhandle->dense_layout != dense_layout) {
+        CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle);
+        auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+        auto cuda_order = detail::get_cuda_order(dense_layout);
+        CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows, num_cols, ld,
+                          val, cuda_value_type, cuda_order);
+        dmhandle->num_rows = num_rows;
+        dmhandle->num_cols = num_cols;
+        dmhandle->ld = ld;
+        dmhandle->dense_layout = dense_layout;
+    }
+    else {
+        CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle, val);
+    }
+    dmhandle->set_usm_ptr(val);
+sycl::event release_dense_matrix(sycl::queue& queue, dense_matrix_handle_t dmhandle,
+                                 const std::vector<sycl::event>& dependencies) {
+    // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used
+    auto functor = [=](sycl::interop_handle) {
+        CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle);
+        delete dmhandle;
+    };
+    return detail::dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dmhandle);
+// COO matrix
+template <typename fpType, typename intType>
+void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows,
+                     std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                     sycl::buffer<intType, 1> row_ind, sycl::buffer<intType, 1> col_ind,
+                     sycl::buffer<fpType, 1> val) {
+    auto event = queue.submit([&](sycl::handler& cgh) {
+        auto row_acc = row_ind.template get_access<sycl::access::mode::read_write>(cgh);
+        auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh);
+        auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+            auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+            auto cuda_index_base = detail::get_cuda_index_base(index);
+            auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+            cusparseSpMatDescr_t cu_smhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz,
+                              detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc),
+                              detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_base,
+                              cuda_value_type);
+            *p_smhandle =
+                new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO,
+                                  num_rows, num_cols, nnz, index);
+        });
+    });
+    event.wait_and_throw();
+template <typename fpType, typename intType>
+void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows,
+                     std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                     intType* row_ind, intType* col_ind, fpType* val) {
+    auto event = queue.submit([&](sycl::handler& cgh) {
+        detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+            auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+            auto cuda_index_base = detail::get_cuda_index_base(index);
+            auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+            cusparseSpMatDescr_t cu_smhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, row_ind,
+                              col_ind, val, cuda_index_type, cuda_index_base, cuda_value_type);
+            *p_smhandle =
+                new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO,
+                                  num_rows, num_cols, nnz, index);
+        });
+    });
+    event.wait_and_throw();
+/// Replace the data of an existing COO matrix handle (buffer overload).
+///
+/// After `detail::check_can_reset_sparse_handle` has accepted the handle, a host task either
+/// recreates the cuSPARSE descriptor (when a dimension, `nnz` or the index base differs from the
+/// values stored in the handle) or only swaps its data pointers. The handle's containers are then
+/// pointed at the new buffers. The call waits for the host task before returning.
+///
+/// @param queue     Queue the host task is submitted to.
+/// @param smhandle  Handle to update in place.
+/// @param num_rows  New number of rows.
+/// @param num_cols  New number of columns.
+/// @param nnz       New number of stored entries.
+/// @param index     New index base.
+/// @param row_ind   Buffer of row indices.
+/// @param col_ind   Buffer of column indices.
+/// @param val       Buffer of values.
+template <typename fpType, typename intType>
+void set_coo_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows,
+                         std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                         sycl::buffer<intType, 1> row_ind, sycl::buffer<intType, 1> col_ind,
+                         sycl::buffer<fpType, 1> val) {
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, smhandle, true);
+    auto event = queue.submit([&](sycl::handler& cgh) {
+        // Accessors are needed so that the native memory can be queried inside the host task.
+        auto row_acc = row_ind.template get_access<sycl::access::mode::read_write>(cgh);
+        auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh);
+        auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+            if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols ||
+                smhandle->nnz != nnz || smhandle->index != index) {
+                // The shape or index base changed: destroy the descriptor and build a new one.
+                CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+                auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+                auto cuda_index_base = detail::get_cuda_index_base(index);
+                auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+                CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols,
+                                  nnz, detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc),
+                                  detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_base,
+                                  cuda_value_type);
+                smhandle->num_rows = num_rows;
+                smhandle->num_cols = num_cols;
+                smhandle->nnz = nnz;
+                smhandle->index = index;
+            }
+            else {
+                // Same shape: keep the descriptor and only update the data pointers.
+                CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle,
+                                  detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc),
+                                  detail::get_mem(ih, val_acc));
+            }
+            // Record the new buffers in the handle's containers.
+            smhandle->row_container.set_buffer(row_ind);
+            smhandle->col_container.set_buffer(col_ind);
+            smhandle->value_container.set_buffer(val);
+        });
+    });
+    event.wait_and_throw();
+/// Replace the data of an existing COO matrix handle (USM overload).
+///
+/// Unlike the buffer overload, no host task is submitted: the cuSPARSE calls are made directly
+/// from the calling thread and the queue argument is unused. The descriptor is recreated when a
+/// dimension, `nnz` or the index base differs from the values stored in the handle; otherwise only
+/// its data pointers are swapped.
+///
+/// @param smhandle  Handle to update in place.
+/// @param num_rows  New number of rows.
+/// @param num_cols  New number of columns.
+/// @param nnz       New number of stored entries.
+/// @param index     New index base.
+/// @param row_ind   Row indices pointer.
+/// @param col_ind   Column indices pointer.
+/// @param val       Values pointer.
+template <typename fpType, typename intType>
+void set_coo_matrix_data(sycl::queue&, matrix_handle_t smhandle, std::int64_t num_rows,
+                         std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                         intType* row_ind, intType* col_ind, fpType* val) {
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, smhandle, false);
+    if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz ||
+        smhandle->index != index) {
+        // The shape or index base changed: destroy the descriptor and build a new one.
+        CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+        auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+        auto cuda_index_base = detail::get_cuda_index_base(index);
+        auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+        CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols, nnz,
+                          row_ind, col_ind, val, cuda_index_type, cuda_index_base, cuda_value_type);
+        smhandle->num_rows = num_rows;
+        smhandle->num_cols = num_cols;
+        smhandle->nnz = nnz;
+        smhandle->index = index;
+    }
+    else {
+        // Same shape: keep the descriptor and only update the data pointers.
+        CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle, row_ind, col_ind, val);
+    }
+    // Record the new pointers in the handle's containers.
+    smhandle->row_container.set_usm_ptr(row_ind);
+    smhandle->col_container.set_usm_ptr(col_ind);
+    smhandle->value_container.set_usm_ptr(val);
+// CSR matrix
+/// Create a CSR sparse matrix handle from SYCL buffers.
+///
+/// Submits a host task that builds a cuSPARSE CSR descriptor over the native memory of the three
+/// buffers, stores a newly allocated `matrix_handle` in `*p_smhandle`, then waits for the task.
+///
+/// @param queue       Queue the host task is submitted to.
+/// @param p_smhandle  Output location that receives the new handle.
+/// @param num_rows    Number of rows forwarded to `cusparseCreateCsr`.
+/// @param num_cols    Number of columns forwarded to `cusparseCreateCsr`.
+/// @param nnz         Number of stored entries forwarded to `cusparseCreateCsr`.
+/// @param index       Index base, converted with `detail::get_cuda_index_base`.
+/// @param row_ptr     Buffer of row offsets.
+/// @param col_ind     Buffer of column indices.
+/// @param val         Buffer of values.
+template <typename fpType, typename intType>
+void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows,
+                     std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                     sycl::buffer<intType, 1> row_ptr, sycl::buffer<intType, 1> col_ind,
+                     sycl::buffer<fpType, 1> val) {
+    auto event = queue.submit([&](sycl::handler& cgh) {
+        // Accessors are needed so that the native memory can be queried inside the host task.
+        auto row_acc = row_ptr.template get_access<sycl::access::mode::read_write>(cgh);
+        auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh);
+        auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+            auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+            auto cuda_index_base = detail::get_cuda_index_base(index);
+            auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+            cusparseSpMatDescr_t cu_smhandle;
+            // cuda_index_type is passed twice: once for the row offsets, once for the column indices.
+            CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz,
+                              detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc),
+                              detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_type,
+                              cuda_index_base, cuda_value_type);
+            // The handle stores the buffers themselves alongside the descriptor.
+            *p_smhandle =
+                new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR,
+                                  num_rows, num_cols, nnz, index);
+        });
+    });
+    // The host task writes through p_smhandle, so block until it has run before returning.
+    event.wait_and_throw();
+/// Create a CSR sparse matrix handle from raw pointers (USM overload).
+///
+/// Submits a host task that builds a cuSPARSE CSR descriptor over `row_ptr`, `col_ind` and `val`,
+/// stores a newly allocated `matrix_handle` in `*p_smhandle`, then waits for the task to finish.
+///
+/// @param queue       Queue the host task is submitted to.
+/// @param p_smhandle  Output location that receives the new handle.
+/// @param num_rows    Number of rows forwarded to `cusparseCreateCsr`.
+/// @param num_cols    Number of columns forwarded to `cusparseCreateCsr`.
+/// @param nnz         Number of stored entries forwarded to `cusparseCreateCsr`.
+/// @param index       Index base, converted with `detail::get_cuda_index_base`.
+/// @param row_ptr     Row offsets pointer, passed to cuSPARSE as-is.
+/// @param col_ind     Column indices pointer, passed to cuSPARSE as-is.
+/// @param val         Values pointer, passed to cuSPARSE as-is.
+template <typename fpType, typename intType>
+void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows,
+                     std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                     intType* row_ptr, intType* col_ind, fpType* val) {
+    auto event = queue.submit([&](sycl::handler& cgh) {
+        detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            detail::CusparseScopedContextHandler(queue, ih).get_handle(queue);
+            auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+            auto cuda_index_base = detail::get_cuda_index_base(index);
+            auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+            cusparseSpMatDescr_t cu_smhandle;
+            // cuda_index_type is passed twice: once for the row offsets, once for the column indices.
+            CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, row_ptr,
+                              col_ind, val, cuda_index_type, cuda_index_type, cuda_index_base,
+                              cuda_value_type);
+            *p_smhandle =
+                new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR,
+                                  num_rows, num_cols, nnz, index);
+        });
+    });
+    // The host task writes through p_smhandle, so block until it has run before returning.
+    event.wait_and_throw();
+/// Replace the data of an existing CSR matrix handle (buffer overload).
+///
+/// After `detail::check_can_reset_sparse_handle` has accepted the handle, a host task either
+/// recreates the cuSPARSE descriptor (when a dimension, `nnz` or the index base differs from the
+/// values stored in the handle) or only swaps its data pointers. The handle's containers are then
+/// pointed at the new buffers. The call waits for the host task before returning.
+///
+/// @param queue     Queue the host task is submitted to.
+/// @param smhandle  Handle to update in place.
+/// @param num_rows  New number of rows.
+/// @param num_cols  New number of columns.
+/// @param nnz       New number of stored entries.
+/// @param index     New index base.
+/// @param row_ptr   Buffer of row offsets.
+/// @param col_ind   Buffer of column indices.
+/// @param val       Buffer of values.
+template <typename fpType, typename intType>
+void set_csr_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows,
+                         std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                         sycl::buffer<intType, 1> row_ptr, sycl::buffer<intType, 1> col_ind,
+                         sycl::buffer<fpType, 1> val) {
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, smhandle, true);
+    auto event = queue.submit([&](sycl::handler& cgh) {
+        // Accessors are needed so that the native memory can be queried inside the host task.
+        auto row_acc = row_ptr.template get_access<sycl::access::mode::read_write>(cgh);
+        auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh);
+        auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) {
+            if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols ||
+                smhandle->nnz != nnz || smhandle->index != index) {
+                // The shape or index base changed: destroy the descriptor and build a new one.
+                CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+                auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+                auto cuda_index_base = detail::get_cuda_index_base(index);
+                auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+                CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols,
+                                  nnz, detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc),
+                                  detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_type,
+                                  cuda_index_base, cuda_value_type);
+                smhandle->num_rows = num_rows;
+                smhandle->num_cols = num_cols;
+                smhandle->nnz = nnz;
+                smhandle->index = index;
+            }
+            else {
+                // Same shape: keep the descriptor and only update the data pointers.
+                CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle,
+                                  detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc),
+                                  detail::get_mem(ih, val_acc));
+            }
+            // Record the new buffers in the handle's containers.
+            smhandle->row_container.set_buffer(row_ptr);
+            smhandle->col_container.set_buffer(col_ind);
+            smhandle->value_container.set_buffer(val);
+        });
+    });
+    event.wait_and_throw();
+/// Replace the data of an existing CSR matrix handle (USM overload).
+///
+/// Unlike the buffer overload, no host task is submitted: the cuSPARSE calls are made directly
+/// from the calling thread and the queue argument is unused. The descriptor is recreated when a
+/// dimension, `nnz` or the index base differs from the values stored in the handle; otherwise only
+/// its data pointers are swapped.
+///
+/// @param smhandle  Handle to update in place.
+/// @param num_rows  New number of rows.
+/// @param num_cols  New number of columns.
+/// @param nnz       New number of stored entries.
+/// @param index     New index base.
+/// @param row_ptr   Row offsets pointer.
+/// @param col_ind   Column indices pointer.
+/// @param val       Values pointer.
+template <typename fpType, typename intType>
+void set_csr_matrix_data(sycl::queue&, matrix_handle_t smhandle, std::int64_t num_rows,
+                         std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                         intType* row_ptr, intType* col_ind, fpType* val) {
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, smhandle, false);
+    if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz ||
+        smhandle->index != index) {
+        // The shape or index base changed: destroy the descriptor and build a new one.
+        CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+        auto cuda_index_type = detail::CudaIndexEnumType<intType>::value;
+        auto cuda_index_base = detail::get_cuda_index_base(index);
+        auto cuda_value_type = detail::CudaEnumType<fpType>::value;
+        CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols, nnz,
+                          row_ptr, col_ind, val, cuda_index_type, cuda_index_type, cuda_index_base,
+                          cuda_value_type);
+        smhandle->num_rows = num_rows;
+        smhandle->num_cols = num_cols;
+        smhandle->nnz = nnz;
+        smhandle->index = index;
+    }
+    else {
+        // Same shape: keep the descriptor and only update the data pointers.
+        CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle, row_ptr, col_ind, val);
+    }
+    // Record the new pointers in the handle's containers.
+    smhandle->row_container.set_usm_ptr(row_ptr);
+    smhandle->col_container.set_usm_ptr(col_ind);
+    smhandle->value_container.set_usm_ptr(val);
+/// Destroy a sparse matrix handle once the given dependencies allow it.
+///
+/// The submitted functor destroys the cuSPARSE descriptor and then deletes the handle object, so
+/// `smhandle` must not be used after the returned event has completed.
+///
+/// @param queue         Queue the release is submitted to.
+/// @param smhandle      Handle to release.
+/// @param dependencies  Events forwarded to `detail::dispatch_submit`.
+/// @return The event returned by `detail::dispatch_submit`.
+sycl::event release_sparse_matrix(sycl::queue& queue, matrix_handle_t smhandle,
+                                  const std::vector<sycl::event>& dependencies) {
+    // Use dispatch_submit to ensure the backend's handle is kept alive as long as the buffers are used
+    auto functor = [=](sycl::interop_handle) {
+        CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+        delete smhandle;
+    };
+    return detail::dispatch_submit(__func__, queue, dependencies, functor, smhandle);
+// Matrix property
+/// Record a matrix property on the handle.
+///
+/// The queue argument is unused: the property is only stored in the handle object.
+///
+/// @param smhandle  Handle that stores the property.
+/// @param property  Property to record.
+/// @return Always `false`.
+// NOTE(review): `false` presumably tells the caller that the backend itself does not consume the
+// property; confirm against the interface documentation.
+bool set_matrix_property(sycl::queue&, matrix_handle_t smhandle, matrix_property property) {
+    // No equivalent in cuSPARSE
+    // Store the matrix property internally for future usages
+    smhandle->set_matrix_property(property);
+    return false;
+} // namespace oneapi::mkl::sparse::cusparse
diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.hpp b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp
new file mode 100644
index 000000000..5e5bdc732
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp
@@ -0,0 +1,95 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+#include <cusparse.h>
+#include "sparse_blas/generic_container.hpp"
+namespace oneapi::mkl::sparse {
+// Complete the definition of incomplete types dense_vector_handle, dense_matrix_handle and matrix_handle.
+/// Dense vector handle for the cuSPARSE backend.
+///
+/// Thin wrapper over `detail::generic_dense_vector_handle` instantiated with the cuSPARSE dense
+/// vector descriptor type; both constructors only forward their arguments to the base class.
+struct dense_vector_handle : public detail::generic_dense_vector_handle<cusparseDnVecDescr_t> {
+    /// Build a handle whose values are given as a raw pointer.
+    template <typename T>
+    dense_vector_handle(cusparseDnVecDescr_t cu_descr, T* value_ptr, std::int64_t size)
+            : detail::generic_dense_vector_handle<cusparseDnVecDescr_t>(cu_descr, value_ptr, size) {
+    }
+    /// Build a handle whose values are given as a SYCL buffer.
+    template <typename T>
+    dense_vector_handle(cusparseDnVecDescr_t cu_descr, const sycl::buffer<T, 1> value_buffer,
+                        std::int64_t size)
+            : detail::generic_dense_vector_handle<cusparseDnVecDescr_t>(cu_descr, value_buffer,
+                                                                        size) {}
+/// Dense matrix handle for the cuSPARSE backend.
+///
+/// Thin wrapper over `detail::generic_dense_matrix_handle` instantiated with the cuSPARSE dense
+/// matrix descriptor type; both constructors only forward their arguments to the base class.
+struct dense_matrix_handle : public detail::generic_dense_matrix_handle<cusparseDnMatDescr_t> {
+    /// Build a handle whose values are given as a raw pointer.
+    template <typename T>
+    dense_matrix_handle(cusparseDnMatDescr_t cu_descr, T* value_ptr, std::int64_t num_rows,
+                        std::int64_t num_cols, std::int64_t ld, layout dense_layout)
+            : detail::generic_dense_matrix_handle<cusparseDnMatDescr_t>(
+                  cu_descr, value_ptr, num_rows, num_cols, ld, dense_layout) {}
+    /// Build a handle whose values are given as a SYCL buffer.
+    template <typename T>
+    dense_matrix_handle(cusparseDnMatDescr_t cu_descr, const sycl::buffer<T, 1> value_buffer,
+                        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
+                        layout dense_layout)
+            : detail::generic_dense_matrix_handle<cusparseDnMatDescr_t>(
+                  cu_descr, value_buffer, num_rows, num_cols, ld, dense_layout) {}
+/// Sparse matrix handle for the cuSPARSE backend.
+///
+/// Thin wrapper over `detail::generic_sparse_handle` instantiated with the cuSPARSE sparse matrix
+/// descriptor type; both constructors only forward their arguments to the base class.
+struct matrix_handle : public detail::generic_sparse_handle<cusparseSpMatDescr_t> {
+    /// Build a handle whose row, column and value data are given as raw pointers.
+    template <typename fpType, typename intType>
+    matrix_handle(cusparseSpMatDescr_t cu_descr, intType* row_ptr, intType* col_ptr,
+                  fpType* value_ptr, detail::sparse_format format, std::int64_t num_rows,
+                  std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index)
+            : detail::generic_sparse_handle<cusparseSpMatDescr_t>(
+                  cu_descr, row_ptr, col_ptr, value_ptr, format, num_rows, num_cols, nnz, index) {}
+    /// Build a handle whose row, column and value data are given as SYCL buffers.
+    template <typename fpType, typename intType>
+    matrix_handle(cusparseSpMatDescr_t cu_descr, const sycl::buffer<intType, 1> row_buffer,
+                  const sycl::buffer<intType, 1> col_buffer,
+                  const sycl::buffer<fpType, 1> value_buffer, detail::sparse_format format,
+                  std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
+                  oneapi::mkl::index_base index)
+            : detail::generic_sparse_handle<cusparseSpMatDescr_t>(cu_descr, row_buffer, col_buffer,
+                                                                  value_buffer, format, num_rows,
+                                                                  num_cols, nnz, index) {}
+namespace detail {
+/// Reject COO matrices that have not been declared as sorted.
+///
+/// Matrices in any other format pass unconditionally.
+///
+/// @param function_name  Name reported in the exception.
+/// @param sm_handle      Handle whose format and properties are inspected.
+/// @throws mkl::unimplemented if the format is COO and neither `matrix_property::sorted_by_rows`
+///         nor `matrix_property::sorted` has been set on the handle.
+inline void check_valid_matrix_properties(const std::string& function_name,
+                                          matrix_handle_t sm_handle) {
+    if (sm_handle->format == sparse_format::COO &&
+        !(sm_handle->has_matrix_property(matrix_property::sorted_by_rows) ||
+          sm_handle->has_matrix_property(matrix_property::sorted))) {
+        throw mkl::unimplemented(
+            "sparse_blas", function_name,
+            "The backend does not support unsorted COO format. Use `set_matrix_property` to set the property `matrix_property::sorted_by_rows` or `matrix_property::sorted`");
+    }
+} // namespace detail
+} // namespace oneapi::mkl::sparse
diff --git a/src/sparse_blas/backends/cusparse/cusparse_helper.hpp b/src/sparse_blas/backends/cusparse/cusparse_helper.hpp
new file mode 100644
index 000000000..3feb4bcad
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_helper.hpp
@@ -0,0 +1,166 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+#include <complex>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <cusparse.h>
+#include "oneapi/mkl/sparse_blas/types.hpp"
+#include "sparse_blas/enum_data_types.hpp"
+#include "sparse_blas/sycl_helper.hpp"
+#include "cusparse_error.hpp"
+namespace oneapi::mkl::sparse::cusparse::detail {
+using namespace oneapi::mkl::sparse::detail;
+/// Compile-time map from a C++ value type to the matching `cudaDataType_t` enumerator.
+/// Only the specializations below are defined; any other type fails to compile.
+template <typename T>
+struct CudaEnumType;
+/// `float` maps to `CUDA_R_32F`.
+template <>
+struct CudaEnumType<float> {
+    static constexpr cudaDataType_t value = CUDA_R_32F;
+/// `double` maps to `CUDA_R_64F`.
+template <>
+struct CudaEnumType<double> {
+    static constexpr cudaDataType_t value = CUDA_R_64F;
+/// `std::complex<float>` maps to `CUDA_C_32F`.
+template <>
+struct CudaEnumType<std::complex<float>> {
+    static constexpr cudaDataType_t value = CUDA_C_32F;
+/// `std::complex<double>` maps to `CUDA_C_64F`.
+template <>
+struct CudaEnumType<std::complex<double>> {
+    static constexpr cudaDataType_t value = CUDA_C_64F;
+/// Compile-time map from a C++ index type to the matching `cusparseIndexType_t` enumerator.
+/// Only the specializations below are defined; any other type fails to compile.
+template <typename T>
+struct CudaIndexEnumType;
+/// `std::int32_t` maps to `CUSPARSE_INDEX_32I`.
+template <>
+struct CudaIndexEnumType<std::int32_t> {
+    static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+/// `std::int64_t` maps to `CUSPARSE_INDEX_64I`.
+template <>
+struct CudaIndexEnumType<std::int64_t> {
+    static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I;
+/// Convert an enumerator to the decimal string of its numeric value.
+///
+/// Used to build the messages of the exceptions thrown for unknown enumerators.
+///
+/// @tparam E  Enumeration type.
+/// @param e   Enumerator to convert.
+/// @return The numeric value of `e` formatted in base 10.
+template <typename E>
+inline std::string cast_enum_to_str(E e) {
+    // Widen instead of narrowing to char, so that values outside the char range are not
+    // truncated or sign-flipped in the reported message.
+    return std::to_string(static_cast<long long>(e));
+/// Return the `cudaDataType_t` enumerator matching a oneMKL `data_type`.
+///
+/// @param onemkl_data_type  Runtime data type tag.
+/// @throws oneapi::mkl::invalid_argument if the tag is none of the four handled values.
+inline cudaDataType_t get_cuda_value_type(data_type onemkl_data_type) {
+    switch (onemkl_data_type) {
+        case data_type::real_fp32: return CUDA_R_32F;
+        case data_type::real_fp64: return CUDA_R_64F;
+        case data_type::complex_fp32: return CUDA_C_32F;
+        case data_type::complex_fp64: return CUDA_C_64F;
+        default:
+            throw oneapi::mkl::invalid_argument(
+                "sparse_blas", "get_cuda_value_type",
+                "Invalid data type: " + cast_enum_to_str(onemkl_data_type));
+    }
+/// Return the `cusparseOrder_t` enumerator matching a oneMKL dense `layout`.
+///
+/// @param l  Layout to convert.
+/// @throws oneapi::mkl::invalid_argument if the layout is neither row-major nor column-major.
+inline cusparseOrder_t get_cuda_order(layout l) {
+    switch (l) {
+        case layout::row_major: return CUSPARSE_ORDER_ROW;
+        case layout::col_major: return CUSPARSE_ORDER_COL;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_order",
+                                                "Unknown layout: " + cast_enum_to_str(l));
+    }
+/// Return the `cusparseIndexBase_t` enumerator matching a oneMKL `index_base`.
+///
+/// @param index  Index base to convert.
+/// @throws oneapi::mkl::invalid_argument if the index base is neither zero nor one.
+inline cusparseIndexBase_t get_cuda_index_base(index_base index) {
+    switch (index) {
+        case index_base::zero: return CUSPARSE_INDEX_BASE_ZERO;
+        case index_base::one: return CUSPARSE_INDEX_BASE_ONE;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_index_base",
+                                                "Unknown index_base: " + cast_enum_to_str(index));
+    }
+/// Return the CUDA transpose operation from a oneMKL type.
+/// Do not conjugate for real types to avoid an invalid argument.
+///
+/// @param type  Data type of the operands; decides whether `conjtrans` conjugates.
+/// @param op    oneMKL transpose operation to convert.
+/// @return The matching `cusparseOperation_t`; `conjtrans` maps to the conjugate transpose for
+///         complex types and to the plain transpose for real types.
+/// @throws oneapi::mkl::invalid_argument if `op` is not a known transpose operation.
+inline cusparseOperation_t get_cuda_operation(data_type type, transpose op) {
+    switch (op) {
+        case transpose::nontrans: return CUSPARSE_OPERATION_NON_TRANSPOSE;
+        case transpose::trans: return CUSPARSE_OPERATION_TRANSPOSE;
+        case transpose::conjtrans:
+            // Conjugation is only meaningful for complex values.
+            return (type == data_type::complex_fp32 || type == data_type::complex_fp64)
+                       ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
+                       : CUSPARSE_OPERATION_TRANSPOSE;
+        default:
+            throw oneapi::mkl::invalid_argument(
+                "sparse_blas", "get_cuda_operation",
+                "Unknown transpose operation: " + cast_enum_to_str(op));
+    }
+/// Return the cuSPARSE fill mode enumerator matching a oneMKL `uplo`.
+///
+/// @param uplo_val  Triangle selector to convert.
+/// @throws oneapi::mkl::invalid_argument if the value is neither upper nor lower.
+inline auto get_cuda_uplo(uplo uplo_val) {
+    switch (uplo_val) {
+        case uplo::upper: return CUSPARSE_FILL_MODE_UPPER;
+        case uplo::lower: return CUSPARSE_FILL_MODE_LOWER;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_uplo",
+                                                "Unknown uplo: " + cast_enum_to_str(uplo_val));
+    }
+/// Return the cuSPARSE diagonal type enumerator matching a oneMKL `diag`.
+///
+/// @param diag_val  Diagonal kind to convert.
+/// @throws oneapi::mkl::invalid_argument if the value is neither unit nor non-unit.
+inline auto get_cuda_diag(diag diag_val) {
+    switch (diag_val) {
+        case diag::nonunit: return CUSPARSE_DIAG_TYPE_NON_UNIT;
+        case diag::unit: return CUSPARSE_DIAG_TYPE_UNIT;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_diag",
+                                                "Unknown diag: " + cast_enum_to_str(diag_val));
+    }
+/// Copy the fill mode and diagonal type of a matrix view onto a cuSPARSE sparse matrix descriptor.
+///
+/// Each attribute is set with `cusparseSpMatSetAttribute` and its status is passed to
+/// `check_status`, tagged with `func_name` plus a suffix naming the attribute.
+///
+/// @param func_name  Name of the calling function, used as the prefix of the status tag.
+/// @param cu_a       Descriptor to update.
+/// @param A_view     View providing `uplo_view` and `diag_view`.
+inline void set_matrix_attributes(const std::string& func_name, cusparseSpMatDescr_t cu_a,
+                                  oneapi::mkl::sparse::matrix_view A_view) {
+    auto cu_fill_mode = get_cuda_uplo(A_view.uplo_view);
+    auto status = cusparseSpMatSetAttribute(cu_a, CUSPARSE_SPMAT_FILL_MODE, &cu_fill_mode,
+                                            sizeof(cu_fill_mode));
+    check_status(status, func_name + "/set_uplo");
+    auto cu_diag_type = get_cuda_diag(A_view.diag_view);
+    status = cusparseSpMatSetAttribute(cu_a, CUSPARSE_SPMAT_DIAG_TYPE, &cu_diag_type,
+                                       sizeof(cu_diag_type));
+    check_status(status, func_name + "/set_diag");
+/// cuSPARSE requires the pointer mode to be set for scalar parameters (typically alpha and beta).
+///
+/// @param cu_handle               cuSPARSE handle whose pointer mode is updated.
+/// @param is_ptr_host_accessible  Selects `CUSPARSE_POINTER_MODE_HOST` when true and
+///                                `CUSPARSE_POINTER_MODE_DEVICE` otherwise.
+inline void set_pointer_mode(cusparseHandle_t cu_handle, bool is_ptr_host_accessible) {
+    const auto pointer_mode =
+        is_ptr_host_accessible ? CUSPARSE_POINTER_MODE_HOST : CUSPARSE_POINTER_MODE_DEVICE;
+    // Report a failure instead of silently continuing with whatever mode the handle had before.
+    auto status = cusparseSetPointerMode(cu_handle, pointer_mode);
+    check_status(status, "set_pointer_mode");
+} // namespace oneapi::mkl::sparse::cusparse::detail
diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp
new file mode 100644
index 000000000..4d92daf35
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp
@@ -0,0 +1,147 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+ * @file Similar to cublas_scope_handle.cpp
+#include "cusparse_scope_handle.hpp"
+namespace oneapi::mkl::sparse::cusparse::detail {
+ * Inserts a new element in the map if its key is unique. This new element
+ * is constructed in place using args as the arguments for the construction
+ * of a value_type (which is an object of a pair type). The insertion only
+ * takes place if no other element in the container has a key equivalent to
+ * the one being emplaced (keys in a map container are unique).
+ */
+thread_local cusparse_global_handle<ur_context_handle_t>
+    CusparseScopedContextHandler::handle_helper = cusparse_global_handle<ur_context_handle_t>{};
+thread_local cusparse_global_handle<pi_context> CusparseScopedContextHandler::handle_helper =
+    cusparse_global_handle<pi_context>{};
+/// Make the primary CUDA context of the interop device current on the calling thread.
+///
+/// The context that was current beforehand is remembered in `original_`. It is flagged for
+/// restoration by the destructor only when it was a different, non-null context.
+///
+/// @param queue  Queue whose SYCL context is copied into `placedContext_`.
+/// @param ih     Interop handle used to query the native CUDA device.
+CusparseScopedContextHandler::CusparseScopedContextHandler(sycl::queue queue,
+                                                           sycl::interop_handle& ih)
+        : ih(ih),
+          needToRecover_(false) {
+    placedContext_ = new sycl::context(queue.get_context());
+    auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
+    CUcontext desired;
+    CUDA_ERROR_FUNC(cuCtxGetCurrent, &original_);
+    CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, &desired, cudaDevice);
+    // NOTE(review): no matching cuDevicePrimaryCtxRelease is visible in this file; confirm the
+    // retain count is balanced elsewhere.
+    if (original_ != desired) {
+        // Sets the desired context as the active one for the thread
+        CUDA_ERROR_FUNC(cuCtxSetCurrent, desired);
+        // No context is installed and the suggested context is primary
+        // This is the most common case. We can activate the context in the
+        // thread and leave it there until all the PI context referring to the
+        // same underlying CUDA primary context are destroyed. This emulates
+        // the behaviour of the CUDA runtime api, and avoids costly context
+        // switches. No action is required on this side of the if.
+        // Only a previously installed (non-null) context has to be put back by the destructor.
+        needToRecover_ = !(original_ == nullptr);
+    }
+/// Restore the CUDA context that was current before construction, if the constructor flagged it,
+/// and free the SYCL context copy allocated by the constructor.
+// NOTE(review): declared noexcept(false), presumably because CUDA_ERROR_FUNC can throw; confirm.
+CusparseScopedContextHandler::~CusparseScopedContextHandler() noexcept(false) {
+    if (needToRecover_) {
+        CUDA_ERROR_FUNC(cuCtxSetCurrent, original_);
+    }
+    delete placedContext_;
+/// Callback registered as the extended deleter of a SYCL context.
+///
+/// `userData` is the atomic cuSPARSE handle slot stored in the handle cache. The slot is
+/// atomically emptied: if it still held a handle, the handle is destroyed and the slot object is
+/// left alive; if it was already empty, the slot object itself is deleted.
+///
+/// @param userData  Pointer to a `std::atomic<cusparseHandle_t>`; a null pointer is ignored.
+void ContextCallback(void* userData) {
+    auto* ptr = static_cast<std::atomic<cusparseHandle_t>*>(userData);
+    if (!ptr) {
+        return;
+    }
+    // exchange() guarantees that only one party observes the non-null handle.
+    auto handle = ptr->exchange(nullptr);
+    if (handle != nullptr) {
+        CUSPARSE_ERR_FUNC(cusparseDestroy, handle);
+        handle = nullptr;
+    }
+    else {
+        // if the handle is nullptr it means the handle was already destroyed by
+        // the cusparse_global_handle destructor and we're free to delete the atomic
+        // object.
+        delete ptr;
+    }
+std::pair<cusparseHandle_t, CUstream> CusparseScopedContextHandler::get_handle_and_stream(
+    const sycl::queue& queue) {
+    auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
+    CUcontext desired;
+    CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, &desired, cudaDevice);
+    auto piPlacedContext_ = reinterpret_cast<ur_context_handle_t>(desired);
+    auto piPlacedContext_ = reinterpret_cast<pi_context>(desired);
+    CUstream streamId = get_stream(queue);
+    auto it = handle_helper.cusparse_global_handle_mapper_.find(piPlacedContext_);
+    if (it != handle_helper.cusparse_global_handle_mapper_.end()) {
+        if (it->second == nullptr) {
+            handle_helper.cusparse_global_handle_mapper_.erase(it);
+        }
+        else {
+            auto handle = it->second->load();
+            if (handle != nullptr) {
+                cudaStream_t currentStreamId;
+                CUSPARSE_ERR_FUNC(cusparseGetStream, handle, &currentStreamId);
+                if (currentStreamId != streamId) {
+                    CUSPARSE_ERR_FUNC(cusparseSetStream, handle, streamId);
+                }
+                return { handle, streamId };
+            }
+            else {
+                handle_helper.cusparse_global_handle_mapper_.erase(it);
+            }
+        }
+    }
+    cusparseHandle_t handle;
+    CUSPARSE_ERR_FUNC(cusparseCreate, &handle);
+    CUSPARSE_ERR_FUNC(cusparseSetStream, handle, streamId);
+    auto insert_iter = handle_helper.cusparse_global_handle_mapper_.insert(
+        std::make_pair(piPlacedContext_, new std::atomic<cusparseHandle_t>(handle)));
+    sycl::detail::pi::contextSetExtendedDeleter(*placedContext_, ContextCallback,
+                                                insert_iter.first->second);
+    return { handle, streamId };
+/// Convenience wrapper returning only the handle part of `get_handle_and_stream`.
+cusparseHandle_t CusparseScopedContextHandler::get_handle(const sycl::queue& queue) {
+    return get_handle_and_stream(queue).first;
+/// Return the native CUDA object obtained from `queue` through `sycl::get_native`.
+CUstream CusparseScopedContextHandler::get_stream(const sycl::queue& queue) {
+    return sycl::get_native<sycl::backend::ext_oneapi_cuda>(queue);
+/// Return the SYCL context associated with `queue`.
+sycl::context CusparseScopedContextHandler::get_context(const sycl::queue& queue) {
+    return queue.get_context();
+} // namespace oneapi::mkl::sparse::cusparse::detail
diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp
new file mode 100644
index 000000000..7b8313ee6
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp
@@ -0,0 +1,88 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+ * @file Similar to cublas_scope_handle.hpp
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#include <CL/sycl.hpp>
+// After Plugin Interface removal in DPC++ ur.hpp is the new include
+#if __has_include(<sycl/detail/ur.hpp>) && !defined(ONEAPI_ONEMKL_PI_INTERFACE_REMOVED)
+#include <thread>
+#include "cusparse_error.hpp"
+#include "cusparse_global_handle.hpp"
+#include "cusparse_helper.hpp"
+namespace oneapi::mkl::sparse::cusparse::detail {
+class CusparseScopedContextHandler {
+    CUcontext original_;
+    sycl::context* placedContext_;
+    sycl::interop_handle& ih;
+    bool needToRecover_;
+    static thread_local cusparse_global_handle<ur_context_handle_t> handle_helper;
+    static thread_local cusparse_global_handle<pi_context> handle_helper;
+    CUstream get_stream(const sycl::queue& queue);
+    sycl::context get_context(const sycl::queue& queue);
+    CusparseScopedContextHandler(sycl::queue queue, sycl::interop_handle& ih);
+    ~CusparseScopedContextHandler() noexcept(false);
+    /**
+     * @brief get_handle: creates the handle by implicitly impose the advice
+     * given by nvidia for creating a cusparse_global_handle. (e.g. one cuStream per device
+     * per thread).
+     * @param queue sycl queue.
+     * @return a pair of: cusparseHandle_t a handle to construct cusparse routines; and a CUDA stream
+     */
+    std::pair<cusparseHandle_t, CUstream> get_handle_and_stream(const sycl::queue& queue);
+    /// See get_handle_and_stream
+    cusparseHandle_t get_handle(const sycl::queue& queue);
+// Get the native pointer from an accessor. This is a different pointer than
+// what can be retrieved with get_multi_ptr.
+//
+// ih:  interop handle of the running host task.
+// acc: accessor whose native memory is queried.
+// Returns the native memory object converted to an untyped pointer.
+template <typename AccT>
+inline void* get_mem(sycl::interop_handle ih, AccT acc) {
+    auto cudaPtr = ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc);
+    return reinterpret_cast<void*>(cudaPtr);
+} // namespace oneapi::mkl::sparse::cusparse::detail
diff --git a/src/sparse_blas/backends/cusparse/cusparse_task.hpp b/src/sparse_blas/backends/cusparse/cusparse_task.hpp
new file mode 100644
index 000000000..0d86d642d
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_task.hpp
@@ -0,0 +1,431 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+#include "cusparse_handles.hpp"
+#include "cusparse_scope_handle.hpp"
+/// This file provides helper functions to submit host_task using buffers or USM seamlessly
+namespace oneapi::mkl::sparse::cusparse::detail {
+template <typename T, typename Container>
+auto get_value_accessor(sycl::handler& cgh, Container container) { // Returns a read_write accessor on the value buffer of \p container, requested through \p cgh.
+    auto buffer_ptr =
+        reinterpret_cast<sycl::buffer<T, 1>*>(container->value_container.buffer_ptr.get()); // the stored buffer pointer is reinterpreted as a 1D buffer of T
+    return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh);
+template <typename T, typename... Ts>
+auto get_fp_accessors(sycl::handler& cgh, Ts... containers) { // Returns an array holding one value accessor (element type T) per given container.
+    return std::array<sycl::accessor<T, 1>, sizeof...(containers)>{ get_value_accessor<T>(
+        cgh, containers)... }; // pack expansion: get_value_accessor<T> is called once per container
+template <typename T>
+auto get_row_accessor(sycl::handler& cgh, matrix_handle_t smhandle) { // Returns a read_write accessor on the row buffer of \p smhandle, requested through \p cgh.
+    auto buffer_ptr =
+        reinterpret_cast<sycl::buffer<T, 1>*>(smhandle->row_container.buffer_ptr.get()); // T is the index type chosen by the caller
+    return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh);
+template <typename T>
+auto get_col_accessor(sycl::handler& cgh, matrix_handle_t smhandle) { // Returns a read_write accessor on the column buffer of \p smhandle, requested through \p cgh.
+    auto buffer_ptr =
+        reinterpret_cast<sycl::buffer<T, 1>*>(smhandle->col_container.buffer_ptr.get()); // T is the index type chosen by the caller
+    return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh);
+template <typename T>
+auto get_int_accessors(sycl::handler& cgh, matrix_handle_t smhandle) { // Returns the index accessors of \p smhandle as { row accessor, column accessor }.
+    return std::array<sycl::accessor<T, 1>, 2>{ get_row_accessor<T>(cgh, smhandle),
+                                                get_col_accessor<T>(cgh, smhandle) };
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_host_task(sycl::handler& cgh, sycl::queue& queue, Functor functor,
+                      CaptureOnlyAcc... capture_only_accessors) {
+    // Only capture the accessors to ensure the dependencies are properly
+    // handled. The accessors' pointers have already been set to the native
+    // container types in previous functions. This assumes the underlying
+    // pointer of the buffer does not change. This is not guaranteed by the SYCL
+    // specification but should be true for all the implementations. This
+    // assumption avoids the overhead of resetting the pointer of all data
+    // handles for each enqueued command.
+    cgh.host_task([functor, queue, capture_only_accessors...](sycl::interop_handle ih) { // everything is captured by copy
+        auto unused = std::make_tuple(capture_only_accessors...); // references the captured accessors; they are not read or written here
+        (void)unused;
+        functor(ih); // the functor only receives the interop handle
+    });
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_host_task_with_acc(sycl::handler& cgh, sycl::queue& queue, Functor functor,
+                               sycl::accessor<std::uint8_t> workspace_acc,
+                               CaptureOnlyAcc... capture_only_accessors) {
+    // Only capture the accessors to ensure the dependencies are properly
+    // handled. The accessors' pointers have already been set to the native
+    // container types in previous functions. This assumes the underlying
+    // pointer of the buffer does not change. This is not guaranteed by the SYCL
+    // specification but should be true for all the implementations. This
+    // assumption avoids the overhead of resetting the pointer of all data
+    // handles for each enqueued command.
+    cgh.host_task(
+        [functor, queue, workspace_acc, capture_only_accessors...](sycl::interop_handle ih) { // everything is captured by copy
+            auto unused = std::make_tuple(capture_only_accessors...); // references the captured accessors; they are not read or written here
+            (void)unused;
+            functor(ih, workspace_acc); // unlike submit_host_task, the workspace accessor is forwarded as last argument
+        });
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_native_command_ext(sycl::handler& cgh, sycl::queue& queue, Functor functor,
+                               const std::vector<sycl::event>& dependencies,
+                               CaptureOnlyAcc... capture_only_accessors) {
+    // Only capture the accessors to ensure the dependencies are properly
+    // handled. The accessors' pointers have already been set to the native
+    // container types in previous functions. This assumes the underlying
+    // pointer of the buffer does not change. This is not guaranteed by the SYCL
+    // specification but should be true for all the implementations. This
+    // assumption avoids the overhead of resetting the pointer of all data
+    // handles for each enqueued command.
+    cgh.ext_codeplay_enqueue_native_command(
+        [functor, queue, dependencies, capture_only_accessors...](sycl::interop_handle ih) { // the dependency vector is copied into the closure
+            auto unused = std::make_tuple(capture_only_accessors...);
+            (void)unused;
+            // Functors using ext_codeplay_enqueue_native_command need to
+            // explicitly wait on the events for the SPARSE domain. The
+            // extension ext_codeplay_enqueue_native_command is used to launch
+            // the compute operation which depends on the previous optimize
+            // step. In cuSPARSE the optimize step is synchronous but it is
+            // asynchronous in oneMKL Interface. The optimize step may not use
+            // the CUDA stream which would make it impossible for
+            // ext_codeplay_enqueue_native_command to automatically ensure it
+            // has completed before the compute function starts. These waits are
+            // used to ensure the optimize step has completed before starting
+            // the computation.
+            for (auto event : dependencies) { // each event is copied before calling wait() on it
+                event.wait();
+            }
+            functor(ih);
+        });
+    (void)dependencies; // NOTE(review): this line and the next presumably form the fallback branch used when the extension is unavailable; the preprocessor guards are not visible here - confirm
+    submit_host_task(cgh, queue, functor, capture_only_accessors...);
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_native_command_ext_with_acc(sycl::handler& cgh, sycl::queue& queue, Functor functor,
+                                        const std::vector<sycl::event>& dependencies,
+                                        sycl::accessor<std::uint8_t> workspace_acc,
+                                        CaptureOnlyAcc... capture_only_accessors) {
+    // Only capture the accessors to ensure the dependencies are properly
+    // handled. The accessors' pointers have already been set to the native
+    // container types in previous functions. This assumes the underlying
+    // pointer of the buffer does not change. This is not guaranteed by the SYCL
+    // specification but should be true for all the implementations. This
+    // assumption avoids the overhead of resetting the pointer of all data
+    // handles for each enqueued command.
+    cgh.ext_codeplay_enqueue_native_command([functor, queue, dependencies, workspace_acc,
+                                             capture_only_accessors...](sycl::interop_handle ih) {
+        auto unused = std::make_tuple(capture_only_accessors...);
+        (void)unused;
+        // Functors using ext_codeplay_enqueue_native_command need to
+        // explicitly wait on the events for the SPARSE domain. The
+        // extension ext_codeplay_enqueue_native_command is used to launch
+        // the compute operation which depends on the previous optimize
+        // step. In cuSPARSE the optimize step is synchronous but it is
+        // asynchronous in oneMKL Interface. The optimize step may not use
+        // the CUDA stream which would make it impossible for
+        // ext_codeplay_enqueue_native_command to automatically ensure it
+        // has completed before the compute function starts. These waits are
+        // used to ensure the optimize step has completed before starting
+        // the computation.
+        for (auto event : dependencies) { // each event is copied before calling wait() on it
+            event.wait();
+        }
+        functor(ih, workspace_acc); // the workspace accessor is forwarded as last argument
+    });
+    (void)dependencies; // NOTE(review): this line and the next presumably form the fallback branch used when the extension is unavailable; the preprocessor guards are not visible here - confirm
+    submit_host_task_with_acc(cgh, queue, functor, workspace_acc, capture_only_accessors...);
+/// Helper submit function capturing accessors from \p sm_handle and the generic
+/// containers \p other_containers to ensure buffer dependencies are respected.
+/// The accessors are not directly used as the underlying data pointer has
+/// already been captured in previous functions.
+/// \p workspace_buffer is an optional buffer. Its accessor is given to the
+/// functor as the last argument if \p UseWorkspace is true. \p UseWorkspace must
+/// be true to use \p workspace_buffer; it throws if the handle does not use buffers.
+/// \p UseEnqueueNativeCommandExt controls whether host_task or the extension
+/// ext_codeplay_enqueue_native_command is used to launch tasks. The extension
+/// should only be used for asynchronous functions using the native backend's
+/// functions. It is only used for in-order queues (host_task otherwise) as the
+/// same cuStream needs to be used for the 3 steps to run an operation: querying
+/// the buffer size, optimizing and running the computation. This means a
+/// different cuStream can be used inside the native_command than the native
+/// cuStream used by the extension.
+template <bool UseWorkspace, bool UseEnqueueNativeCommandExt, typename Functor, typename... Ts>
+sycl::event dispatch_submit_impl_fp_int(const std::string& function_name, sycl::queue queue,
+                                        const std::vector<sycl::event>& dependencies,
+                                        Functor functor, matrix_handle_t sm_handle,
+                                        sycl::buffer<std::uint8_t> workspace_buffer,
+                                        Ts... other_containers) {
+    bool is_in_order_queue = queue.is_in_order(); // the native command path below is only taken for in-order queues
+    if (sm_handle->all_use_buffer()) {
+        data_type value_type = sm_handle->get_value_type(); // selects the floating point type of the accessors
+        data_type int_type = sm_handle->get_int_type(); // selects the index type of the accessors
+#define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, INT_TYPE)                                                 \
+    return queue.submit([&](sycl::handler& cgh) {                                                 \
+        cgh.depends_on(dependencies);                                                             \
+        auto fp_accs = get_fp_accessors<FP_TYPE>(cgh, sm_handle, other_containers...);            \
+        auto int_accs = get_int_accessors<INT_TYPE>(cgh, sm_handle);                              \
+        auto workspace_acc = workspace_buffer.get_access<sycl::access::mode::read_write>(cgh);    \
+        if constexpr (UseWorkspace) {                                                             \
+            if constexpr (UseEnqueueNativeCommandExt) {                                           \
+                if (is_in_order_queue) {                                                          \
+                    submit_native_command_ext_with_acc(cgh, queue, functor, dependencies,         \
+                                                       workspace_acc, fp_accs, int_accs);         \
+                }                                                                                 \
+                else {                                                                            \
+                    submit_host_task_with_acc(cgh, queue, functor, workspace_acc, fp_accs,        \
+                                              int_accs);                                          \
+                }                                                                                 \
+            }                                                                                     \
+            else {                                                                                \
+                submit_host_task_with_acc(cgh, queue, functor, workspace_acc, fp_accs, int_accs); \
+            }                                                                                     \
+        }                                                                                         \
+        else {                                                                                    \
+            (void)workspace_buffer;                                                               \
+            if constexpr (UseEnqueueNativeCommandExt) {                                           \
+                if (is_in_order_queue) {                                                          \
+                    submit_native_command_ext(cgh, queue, functor, dependencies, fp_accs,         \
+                                              int_accs);                                          \
+                }                                                                                 \
+                else {                                                                            \
+                    submit_host_task(cgh, queue, functor, fp_accs, int_accs);                     \
+                }                                                                                 \
+            }                                                                                     \
+            else {                                                                                \
+                submit_host_task(cgh, queue, functor, fp_accs, int_accs);                         \
+            }                                                                                     \
+        }                                                                                         \
+    })
+    if (int_type == data_type::int32) {                \
+        ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, std::int32_t); \
+    }                                                  \
+    else if (int_type == data_type::int64) {           \
+        ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, std::int64_t); \
+    }
+        if (value_type == data_type::real_fp32) {
+            ONEMKL_CUSPARSE_SUBMIT_INT(float)
+        }
+        else if (value_type == data_type::real_fp64) {
+            ONEMKL_CUSPARSE_SUBMIT_INT(double)
+        }
+        else if (value_type == data_type::complex_fp32) {
+            ONEMKL_CUSPARSE_SUBMIT_INT(std::complex<float>)
+        }
+        else if (value_type == data_type::complex_fp64) {
+            ONEMKL_CUSPARSE_SUBMIT_INT(std::complex<double>)
+        } // every matching type combination returns from inside the macros; the throw below is reached otherwise
+        throw oneapi::mkl::exception("sparse_blas", function_name,
+                                     "Could not dispatch buffer kernel to a supported type");
+    }
+    else {
+        // USM submit does not need to capture accessors
+        if constexpr (!UseWorkspace) {
+            return queue.submit([&](sycl::handler& cgh) {
+                cgh.depends_on(dependencies);
+                if constexpr (UseEnqueueNativeCommandExt) {
+                    if (is_in_order_queue) {
+                        submit_native_command_ext(cgh, queue, functor, dependencies);
+                    }
+                    else {
+                        submit_host_task(cgh, queue, functor);
+                    }
+                }
+                else {
+                    submit_host_task(cgh, queue, functor);
+                }
+            });
+        }
+        else {
+            throw oneapi::mkl::exception("sparse_blas", function_name,
+                                         "Internal error: Cannot use accessor workspace with USM");
+        }
+    }
+/// Similar to dispatch_submit_impl_fp_int but only dispatches on the floating point value type of \p container_handle and always submits a host_task.
+template <typename Functor, typename ContainerT>
+sycl::event dispatch_submit_impl_fp(const std::string& function_name, sycl::queue queue,
+                                    const std::vector<sycl::event>& dependencies, Functor functor,
+                                    ContainerT container_handle) {
+    if (container_handle->all_use_buffer()) {
+        data_type value_type = container_handle->get_value_type(); // selects the element type of the value accessor
+#define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE)                                  \
+    return queue.submit([&](sycl::handler& cgh) {                        \
+        cgh.depends_on(dependencies);                                    \
+        auto fp_accs = get_fp_accessors<FP_TYPE>(cgh, container_handle); \
+        submit_host_task(cgh, queue, functor, fp_accs);                  \
+    })
+        if (value_type == data_type::real_fp32) {
+            ONEMKL_CUSPARSE_SUBMIT(float);
+        }
+        else if (value_type == data_type::real_fp64) {
+            ONEMKL_CUSPARSE_SUBMIT(double);
+        }
+        else if (value_type == data_type::complex_fp32) {
+            ONEMKL_CUSPARSE_SUBMIT(std::complex<float>);
+        }
+        else if (value_type == data_type::complex_fp64) {
+            ONEMKL_CUSPARSE_SUBMIT(std::complex<double>);
+        } // every matching value type returns from inside the macro; the throw below is reached otherwise
+        throw oneapi::mkl::exception("sparse_blas", function_name,
+                                     "Could not dispatch buffer kernel to a supported type");
+    }
+    else {
+        return queue.submit([&](sycl::handler& cgh) { // no accessor is captured when the container does not use buffers
+            cgh.depends_on(dependencies);
+            submit_host_task(cgh, queue, functor);
+        });
+    }
+/// Helper function for dispatch_submit_impl_fp_int: host_task submission with a workspace buffer and no explicit dependencies
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit(const std::string& function_name, sycl::queue queue, Functor functor,
+                            matrix_handle_t sm_handle, sycl::buffer<std::uint8_t> workspace_buffer,
+                            Ts... other_containers) {
+    constexpr bool UseWorkspace = true; // the functor receives the workspace accessor as last argument
+    constexpr bool UseEnqueueNativeCommandExt = false;
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, {}, functor, sm_handle, workspace_buffer, other_containers...); // {} is an empty dependency list
+/// Helper function for dispatch_submit_impl_fp_int: host_task submission without workspace, waiting on \p dependencies
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit(const std::string& function_name, sycl::queue queue,
+                            const std::vector<sycl::event>& dependencies, Functor functor,
+                            matrix_handle_t sm_handle, Ts... other_containers) {
+    constexpr bool UseWorkspace = false;
+    constexpr bool UseEnqueueNativeCommandExt = false;
+    sycl::buffer<std::uint8_t> no_workspace(sycl::range<1>(0)); // zero-sized placeholder only used to satisfy the signature
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, dependencies, functor, sm_handle, no_workspace, other_containers...);
+/// Helper function for dispatch_submit_impl_fp_int: host_task submission without workspace and without explicit dependencies
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit(const std::string& function_name, sycl::queue queue, Functor functor,
+                            matrix_handle_t sm_handle, Ts... other_containers) {
+    constexpr bool UseWorkspace = false;
+    constexpr bool UseEnqueueNativeCommandExt = false;
+    sycl::buffer<std::uint8_t> no_workspace(sycl::range<1>(0)); // zero-sized placeholder only used to satisfy the signature
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, {}, functor, sm_handle, no_workspace, other_containers...); // {} is an empty dependency list
+/// Helper function for dispatch_submit_impl_fp_int: native command variant with a workspace buffer and no explicit dependencies
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit_native_ext(const std::string& function_name, sycl::queue queue,
+                                       Functor functor, matrix_handle_t sm_handle,
+                                       sycl::buffer<std::uint8_t> workspace_buffer,
+                                       Ts... other_containers) {
+    constexpr bool UseWorkspace = true; // the functor receives the workspace accessor as last argument
+    constexpr bool UseEnqueueNativeCommandExt = true;
+    constexpr bool UseEnqueueNativeCommandExt = false; // NOTE(review): second definition presumably belongs to an alternative preprocessor branch whose guards are not visible here - confirm
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, {}, functor, sm_handle, workspace_buffer, other_containers...); // {} is an empty dependency list
+/// Helper function for dispatch_submit_impl_fp_int: native command variant without workspace, waiting on \p dependencies
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit_native_ext(const std::string& function_name, sycl::queue queue,
+                                       const std::vector<sycl::event>& dependencies,
+                                       Functor functor, matrix_handle_t sm_handle,
+                                       Ts... other_containers) {
+    constexpr bool UseWorkspace = false;
+    constexpr bool UseEnqueueNativeCommandExt = true;
+    constexpr bool UseEnqueueNativeCommandExt = false; // NOTE(review): second definition presumably belongs to an alternative preprocessor branch whose guards are not visible here - confirm
+    sycl::buffer<std::uint8_t> no_workspace(sycl::range<1>(0)); // zero-sized placeholder only used to satisfy the signature
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, dependencies, functor, sm_handle, no_workspace, other_containers...);
+/// Helper function for dispatch_submit_impl_fp_int: native command variant without workspace and without explicit dependencies
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit_native_ext(const std::string& function_name, sycl::queue queue,
+                                       Functor functor, matrix_handle_t sm_handle,
+                                       Ts... other_containers) {
+    constexpr bool UseWorkspace = false;
+    constexpr bool UseEnqueueNativeCommandExt = true;
+    constexpr bool UseEnqueueNativeCommandExt = false; // NOTE(review): second definition presumably belongs to an alternative preprocessor branch whose guards are not visible here - confirm
+    sycl::buffer<std::uint8_t> no_workspace(sycl::range<1>(0)); // zero-sized placeholder only used to satisfy the signature
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, {}, functor, sm_handle, no_workspace, other_containers...); // {} is an empty dependency list
+// Helper function for functors submitted to host_task or native_command.
+// When the extension is disabled, host_task is used and the synchronization is needed to ensure the sycl::event corresponds to the end of the whole functor.
+// When the extension is enabled, host_task is still used for out-of-order queues, see the description of dispatch_submit_impl_fp_int.
+inline void synchronize_if_needed(bool is_in_order_queue, CUstream cu_stream) {
+    (void)is_in_order_queue; // NOTE(review): this line and the next presumably form the extension-disabled branch (always synchronize); the preprocessor guards are not visible here - confirm
+    CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream);
+    if (!is_in_order_queue) { // presumably the extension-enabled branch: only out-of-order queues synchronize the stream
+        CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream);
+    }
+} // namespace oneapi::mkl::sparse::cusparse::detail
diff --git a/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp b/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp
new file mode 100644
index 000000000..278aec296
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp
@@ -0,0 +1,32 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+#include "oneapi/mkl/sparse_blas/types.hpp"
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+#include "sparse_blas/function_table.hpp"
+#define BACKEND         cusparse
+extern "C" sparse_blas_function_table_t mkl_sparse_blas_table = { // function table exported with C linkage; its initializers come from the include below
+#include "sparse_blas/backends/backend_wrappers.cxx"
diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp
new file mode 100644
index 000000000..5fd24d3f4
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp
@@ -0,0 +1,336 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_error.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_helper.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_task.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_handles.hpp"
+#include "sparse_blas/common_op_verification.hpp"
+#include "sparse_blas/macros.hpp"
+#include "sparse_blas/matrix_view_comparison.hpp"
+#include "sparse_blas/sycl_helper.hpp"
+namespace oneapi::mkl::sparse {
+// Complete the definition of the incomplete type spmm_descr
+struct spmm_descr {
+    // Cache the CUstream and global handle to avoid relying on CusparseScopedContextHandler to retrieve them.
+    // cuSPARSE seems to implicitly require the same CUstream to be used for a whole operation (buffer_size, optimization and computation steps).
+    // This is needed as the default SYCL queue is out-of-order which can have a different CUstream for each host_task or native_command.
+    CUstream cu_stream;
+    cusparseHandle_t cu_handle;
+    detail::generic_container workspace;
+    std::size_t temp_buffer_size = 0;
+    bool buffer_size_called = false; // checked by common_spmm_optimize, which throws if it is still false
+    bool optimized_called = false; // set to true by common_spmm_optimize
+    oneapi::mkl::transpose last_optimized_opA; // the last_optimized_* members record the arguments given to common_spmm_optimize
+    oneapi::mkl::transpose last_optimized_opB;
+    matrix_view last_optimized_A_view;
+    matrix_handle_t last_optimized_A_handle; // also read by release_spmm_descr to decide how to release the descriptor
+    dense_matrix_handle_t last_optimized_B_handle;
+    dense_matrix_handle_t last_optimized_C_handle;
+    spmm_alg last_optimized_alg;
+} // namespace oneapi::mkl::sparse
+namespace oneapi::mkl::sparse::cusparse {
+namespace detail {
+inline auto get_cuda_spmm_alg(spmm_alg alg) { // Maps a oneMKL spmm_alg value to the matching CUSPARSE_SPMM_* constant.
+    switch (alg) {
+        case spmm_alg::coo_alg1: return CUSPARSE_SPMM_COO_ALG1;
+        case spmm_alg::coo_alg2: return CUSPARSE_SPMM_COO_ALG2;
+        case spmm_alg::coo_alg3: return CUSPARSE_SPMM_COO_ALG3;
+        case spmm_alg::coo_alg4: return CUSPARSE_SPMM_COO_ALG4;
+        case spmm_alg::csr_alg1: return CUSPARSE_SPMM_CSR_ALG1;
+        case spmm_alg::csr_alg2: return CUSPARSE_SPMM_CSR_ALG2;
+        case spmm_alg::csr_alg3: return CUSPARSE_SPMM_CSR_ALG3;
+        default: return CUSPARSE_SPMM_ALG_DEFAULT; // any other value falls back to the default algorithm
+    }
+void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose opA,
+                      oneapi::mkl::transpose opB, matrix_view A_view, matrix_handle_t A_handle,
+                      dense_matrix_handle_t B_handle, dense_matrix_handle_t C_handle,
+                      bool is_alpha_host_accessible, bool is_beta_host_accessible, spmm_alg alg) { // Validates spmm arguments; throws mkl::unimplemented for the csr_alg3 combinations rejected below.
+    check_valid_spmm_common(function_name, A_view, A_handle, B_handle, C_handle,
+                            is_alpha_host_accessible, is_beta_host_accessible); // checks performed before the csr_alg3 specific ones below
+    check_valid_matrix_properties(function_name, A_handle);
+    if (alg == spmm_alg::csr_alg3 && opA != oneapi::mkl::transpose::nontrans) { // csr_alg3 requires a non-transposed A
+        throw mkl::unimplemented(
+            "sparse_blas", function_name,
+            "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opA` is not `transpose::nontrans`.");
+    }
+    if (alg == spmm_alg::csr_alg3 && opB == oneapi::mkl::transpose::conjtrans) { // csr_alg3 rejects a conjugate-transposed B
+        throw mkl::unimplemented(
+            "sparse_blas", function_name,
+            "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opB` is `transpose::conjtrans`.");
+    }
+    if (alg == spmm_alg::csr_alg3 && opB == oneapi::mkl::transpose::trans &&
+        A_handle->get_value_type() == data_type::real_fp64) {
+        // TODO: Remove once the issue is fixed:
+        throw mkl::unimplemented(
+            "sparse_blas", function_name,
+            "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opB` is `transpose::trans` and the real fp64 precision is used.");
+    }
+inline void common_spmm_optimize(oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
+                                 bool is_alpha_host_accessible, matrix_view A_view,
+                                 matrix_handle_t A_handle, dense_matrix_handle_t B_handle,
+                                 bool is_beta_host_accessible, dense_matrix_handle_t C_handle,
+                                 spmm_alg alg, spmm_descr_t spmm_descr) { // Validates the spmm_optimize arguments and records them in \p spmm_descr.
+    check_valid_spmm("spmm_optimize", opA, opB, A_view, A_handle, B_handle, C_handle,
+                     is_alpha_host_accessible, is_beta_host_accessible, alg); // may throw before the descriptor is modified
+    if (!spmm_descr->buffer_size_called) {
+        throw mkl::uninitialized("sparse_blas", "spmm_optimize",
+                                 "spmm_buffer_size must be called before spmm_optimize.");
+    }
+    spmm_descr->optimized_called = true;
+    spmm_descr->last_optimized_opA = opA; // remember the arguments of this optimize call
+    spmm_descr->last_optimized_opB = opB;
+    spmm_descr->last_optimized_A_view = A_view;
+    spmm_descr->last_optimized_A_handle = A_handle;
+    spmm_descr->last_optimized_B_handle = B_handle;
+    spmm_descr->last_optimized_C_handle = C_handle;
+    spmm_descr->last_optimized_alg = alg;
+void spmm_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA,
+                        oneapi::mkl::transpose opB, const void* alpha, matrix_handle_t A_handle,
+                        dense_matrix_handle_t B_handle, const void* beta,
+                        dense_matrix_handle_t C_handle, spmm_alg alg, void* workspace_ptr,
+                        bool is_alpha_host_accessible) { // Converts the arguments to their cuSPARSE equivalents and calls cusparseSpMM_preprocess.
+    auto cu_a = A_handle->backend_handle;
+    auto cu_b = B_handle->backend_handle;
+    auto cu_c = C_handle->backend_handle;
+    auto type = A_handle->value_container.data_type; // the value type of A drives the operation and compute type conversions
+    auto cu_op_a = get_cuda_operation(type, opA);
+    auto cu_op_b = get_cuda_operation(type, opB);
+    auto cu_type = get_cuda_value_type(type);
+    auto cu_alg = get_cuda_spmm_alg(alg);
+    set_pointer_mode(cu_handle, is_alpha_host_accessible); // only the accessibility of alpha is passed here
+    auto status = cusparseSpMM_preprocess(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta,
+                                          cu_c, cu_type, cu_alg, workspace_ptr);
+    check_status(status, "spmm_optimize"); // the status is reported under the name "spmm_optimize"
+} // namespace detail
+void init_spmm_descr(sycl::queue& /*queue*/, spmm_descr_t* p_spmm_descr) { // Allocates a new descriptor and stores it in \p p_spmm_descr; the queue is unused.
+    *p_spmm_descr = new spmm_descr(); // release_spmm_descr deletes the descriptor
+sycl::event release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr,
+                               const std::vector<sycl::event>& dependencies) { // Submits a task deleting \p spmm_descr and returns its event.
+    if (!spmm_descr) { // nothing to delete for a null descriptor
+        return detail::collapse_dependencies(queue, dependencies);
+    }
+    auto release_functor = [=]() { // clears the cached handles, then deletes the descriptor
+        spmm_descr->cu_handle = nullptr;
+        spmm_descr->last_optimized_A_handle = nullptr;
+        spmm_descr->last_optimized_B_handle = nullptr;
+        spmm_descr->last_optimized_C_handle = nullptr;
+        delete spmm_descr;
+    };
+    // Use dispatch_submit to ensure the descriptor is kept alive as long as the buffers are used.
+    // dispatch_submit can only be used if the descriptor's handles are valid.
+    if (spmm_descr->last_optimized_A_handle &&
+        spmm_descr->last_optimized_A_handle->all_use_buffer() &&
+        spmm_descr->last_optimized_B_handle && spmm_descr->last_optimized_C_handle &&
+        spmm_descr->workspace.use_buffer()) {
+        auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor<std::uint8_t>) { // both arguments are ignored
+            release_functor();
+        };
+        return detail::dispatch_submit(
+            __func__, queue, dispatch_functor, spmm_descr->last_optimized_A_handle,
+            spmm_descr->workspace.get_buffer<std::uint8_t>(), spmm_descr->last_optimized_B_handle,
+            spmm_descr->last_optimized_C_handle);
+    } // NOTE(review): the buffer path above does not forward `dependencies` to dispatch_submit - confirm this is intended
+    // Release path used if USM is used or if the descriptor is released before spmm_optimize has succeeded
+    sycl::event event = queue.submit([&](sycl::handler& cgh) {
+        cgh.depends_on(dependencies);
+        cgh.host_task(release_functor);
+    });
+    return event;
+// Query the workspace size that cuSPARSE requires for the described SpMM operation.
+//
+// Validates the arguments with check_valid_spmm, then submits a task that caches the cuSPARSE
+// handle and CUDA stream in `spmm_descr` and calls cusparseSpMM_bufferSize. The function
+// blocks until that task has completed (wait_and_throw), so `temp_buffer_size` is set on return.
+// Afterwards the size is also stored in the descriptor and buffer_size_called is set to true.
+void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
+                      const void* alpha, matrix_view A_view, matrix_handle_t A_handle,
+                      dense_matrix_handle_t B_handle, const void* beta,
+                      dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr,
+                      std::size_t& temp_buffer_size) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    detail::check_valid_spmm(__func__, opA, opB, A_view, A_handle, B_handle, C_handle,
+                             is_alpha_host_accessible, is_beta_host_accessible, alg);
+    // temp_buffer_size is captured by reference: this is safe only because the event is waited
+    // on below, before this function returns.
+    auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) {
+        detail::CusparseScopedContextHandler sc(queue, ih);
+        auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue);
+        // Cached so that the optimize and compute steps reuse the same handle and stream.
+        spmm_descr->cu_handle = cu_handle;
+        spmm_descr->cu_stream = cu_stream;
+        auto cu_a = A_handle->backend_handle;
+        auto cu_b = B_handle->backend_handle;
+        auto cu_c = C_handle->backend_handle;
+        auto type = A_handle->value_container.data_type;
+        auto cu_op_a = detail::get_cuda_operation(type, opA);
+        auto cu_op_b = detail::get_cuda_operation(type, opB);
+        auto cu_type = detail::get_cuda_value_type(type);
+        auto cu_alg = detail::get_cuda_spmm_alg(alg);
+        detail::set_pointer_mode(cu_handle, is_alpha_host_accessible);
+        auto status = cusparseSpMM_bufferSize(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta,
+                                              cu_c, cu_type, cu_alg, &temp_buffer_size);
+        detail::check_status(status, __func__);
+    };
+    auto event = detail::dispatch_submit(__func__, queue, functor, A_handle, B_handle, C_handle);
+    event.wait_and_throw();
+    spmm_descr->temp_buffer_size = temp_buffer_size;
+    spmm_descr->buffer_size_called = true;
+}
+// Buffer overload of spmm_optimize.
+//
+// Throws through throw_incompatible_container if A_handle does not use buffers. Runs
+// common_spmm_optimize, then stores a copy of `workspace` in the descriptor. The cuSPARSE
+// preprocess step is skipped when `alg` is no_optimize_alg or the workspace is empty; otherwise
+// a task calling spmm_optimize_impl with the workspace pointer is submitted. The event of that
+// submission is not returned to the caller.
+void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
+                   const void* alpha, matrix_view A_view, matrix_handle_t A_handle,
+                   dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle,
+                   spmm_alg alg, spmm_descr_t spmm_descr, sycl::buffer<std::uint8_t, 1> workspace) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    if (!A_handle->all_use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+    detail::common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle,
+                                 is_beta_host_accessible, C_handle, alg, spmm_descr);
+    // Copy the buffer to extend its lifetime until the descriptor is free'd.
+    spmm_descr->workspace.set_buffer_untyped(workspace);
+    if (alg == spmm_alg::no_optimize_alg || workspace.size() == 0) {
+        // cusparseSpMM_preprocess cannot be called if the workspace is empty
+        return;
+    }
+    auto functor = [=](sycl::interop_handle ih, sycl::accessor<std::uint8_t> workspace_acc) {
+        // Reuse the handle cached in the descriptor by spmm_buffer_size.
+        auto cu_handle = spmm_descr->cu_handle;
+        auto workspace_ptr = detail::get_mem(ih, workspace_acc);
+        detail::spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle,
+                                   alg, workspace_ptr, is_alpha_host_accessible);
+    };
+    detail::dispatch_submit(__func__, queue, functor, A_handle, workspace, B_handle, C_handle);
+}
+// USM overload of spmm_optimize.
+//
+// Throws through throw_incompatible_container if A_handle uses buffers. Runs
+// common_spmm_optimize, then records the USM `workspace` pointer in the descriptor. When `alg`
+// is no_optimize_alg or `workspace` is null the cuSPARSE preprocess step is skipped and the
+// result of collapse_dependencies(queue, dependencies) is returned. Otherwise returns the event
+// of the submitted task that calls spmm_optimize_impl; that task depends on `dependencies`.
+sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA,
+                          oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view,
+                          matrix_handle_t A_handle, dense_matrix_handle_t B_handle,
+                          const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg,
+                          spmm_descr_t spmm_descr, void* workspace,
+                          const std::vector<sycl::event>& dependencies) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    if (A_handle->all_use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+    detail::common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle,
+                                 is_beta_host_accessible, C_handle, alg, spmm_descr);
+    spmm_descr->workspace.usm_ptr = workspace;
+    if (alg == spmm_alg::no_optimize_alg || workspace == nullptr) {
+        // cusparseSpMM_preprocess cannot be called if the workspace is empty
+        return detail::collapse_dependencies(queue, dependencies);
+    }
+    auto functor = [=](sycl::interop_handle) {
+        // Reuse the handle cached in the descriptor by spmm_buffer_size.
+        auto cu_handle = spmm_descr->cu_handle;
+        detail::spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle,
+                                   alg, workspace, is_alpha_host_accessible);
+    };
+    return detail::dispatch_submit(__func__, queue, dependencies, functor, A_handle, B_handle,
+                                   C_handle);
+}
+// Submit the SpMM computation (cusparseSpMM) and return the event of the submitted task.
+//
+// Before submitting, the function:
+//  - validates the arguments with check_valid_spmm;
+//  - throws through throw_incompatible_container if A_handle and the descriptor's workspace
+//    disagree on buffer vs USM usage;
+//  - throws mkl::uninitialized if spmm_optimize has not been called on the descriptor;
+//  - runs CHECK_DESCR_MATCH (macro defined elsewhere) on each argument recorded by
+//    spmm_optimize - presumably rejecting arguments that differ; confirm against the macro.
+// `dependencies` is only forwarded on the path that does not create a workspace accessor.
+sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
+                 const void* alpha, matrix_view A_view, matrix_handle_t A_handle,
+                 dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle,
+                 spmm_alg alg, spmm_descr_t spmm_descr,
+                 const std::vector<sycl::event>& dependencies) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    detail::check_valid_spmm(__func__, opA, opB, A_view, A_handle, B_handle, C_handle,
+                             is_alpha_host_accessible, is_beta_host_accessible, alg);
+    if (A_handle->all_use_buffer() != spmm_descr->workspace.use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+    if (!spmm_descr->optimized_called) {
+        throw mkl::uninitialized("sparse_blas", __func__,
+                                 "spmm_optimize must be called before spmm.");
+    }
+    CHECK_DESCR_MATCH(spmm_descr, opA, "spmm_optimize");
+    CHECK_DESCR_MATCH(spmm_descr, opB, "spmm_optimize");
+    CHECK_DESCR_MATCH(spmm_descr, A_view, "spmm_optimize");
+    CHECK_DESCR_MATCH(spmm_descr, A_handle, "spmm_optimize");
+    CHECK_DESCR_MATCH(spmm_descr, B_handle, "spmm_optimize");
+    CHECK_DESCR_MATCH(spmm_descr, C_handle, "spmm_optimize");
+    CHECK_DESCR_MATCH(spmm_descr, alg, "spmm_optimize");
+    bool is_in_order_queue = queue.is_in_order();
+    // Shared by the buffer and USM paths; only the way workspace_ptr is obtained differs.
+    auto compute_functor = [=](void* workspace_ptr) {
+        // Reuse the handle and stream cached in the descriptor by spmm_buffer_size.
+        auto cu_handle = spmm_descr->cu_handle;
+        auto cu_a = A_handle->backend_handle;
+        auto cu_b = B_handle->backend_handle;
+        auto cu_c = C_handle->backend_handle;
+        auto type = A_handle->value_container.data_type;
+        auto cu_op_a = detail::get_cuda_operation(type, opA);
+        auto cu_op_b = detail::get_cuda_operation(type, opB);
+        auto cu_type = detail::get_cuda_value_type(type);
+        auto cu_alg = detail::get_cuda_spmm_alg(alg);
+        detail::set_pointer_mode(cu_handle, is_alpha_host_accessible);
+        auto status = cusparseSpMM(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, cu_c,
+                                   cu_type, cu_alg, workspace_ptr);
+        detail::check_status(status, __func__);
+        detail::synchronize_if_needed(is_in_order_queue, spmm_descr->cu_stream);
+    };
+    if (A_handle->all_use_buffer() && spmm_descr->temp_buffer_size > 0) {
+        // The accessor can only be created if the buffer size is greater than 0
+        auto functor_buffer = [=](sycl::interop_handle ih,
+                                  sycl::accessor<std::uint8_t> workspace_acc) {
+            auto workspace_ptr = detail::get_mem(ih, workspace_acc);
+            compute_functor(workspace_ptr);
+        };
+        return detail::dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle,
+                                                  spmm_descr->workspace.get_buffer<std::uint8_t>(),
+                                                  B_handle, C_handle);
+    }
+    else {
+        // The same dispatch_submit can be used for USM or buffers if no
+        // workspace accessor is needed, workspace_ptr will be a nullptr in the
+        // latter case.
+        auto workspace_ptr = spmm_descr->workspace.usm_ptr;
+        auto functor_usm = [=](sycl::interop_handle) {
+            compute_functor(workspace_ptr);
+        };
+        return detail::dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm,
+                                                  A_handle, B_handle, C_handle);
+    }
+}
+} // namespace oneapi::mkl::sparse::cusparse
diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp
new file mode 100644
index 000000000..03b848916
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp
@@ -0,0 +1,335 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_error.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_helper.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_task.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_handles.hpp"
+#include "sparse_blas/common_op_verification.hpp"
+#include "sparse_blas/macros.hpp"
+#include "sparse_blas/matrix_view_comparison.hpp"
+#include "sparse_blas/sycl_helper.hpp"
+namespace oneapi::mkl::sparse {
+// Complete the definition of the incomplete type.
+// State shared between spmv_buffer_size, spmv_optimize, spmv and release_spmv_descr.
+struct spmv_descr {
+    // Cache the CUstream and global handle to avoid relying on CusparseScopedContextHandler to retrieve them.
+    // cuSPARSE seems to implicitly require the same CUstream for a whole operation (buffer_size, optimization and computation steps).
+    // This is needed as the default SYCL queue is out-of-order which can have a different CUstream for each host_task or native_command.
+    CUstream cu_stream = nullptr;
+    cusparseHandle_t cu_handle = nullptr;
+    // Workspace supplied to spmv_optimize, held either as a buffer or as a USM pointer.
+    detail::generic_container workspace;
+    // Size reported by spmv_buffer_size.
+    std::size_t temp_buffer_size = 0;
+    bool buffer_size_called = false;
+    bool optimized_called = false;
+    // Arguments recorded by the last spmv_optimize call; spmv checks its own arguments against them.
+    oneapi::mkl::transpose last_optimized_opA;
+    matrix_view last_optimized_A_view;
+    // release_spmv_descr tests these handles for null, so they are explicitly initialized
+    // instead of relying on the descriptor being value-initialized at allocation.
+    matrix_handle_t last_optimized_A_handle = nullptr;
+    dense_vector_handle_t last_optimized_x_handle = nullptr;
+    dense_vector_handle_t last_optimized_y_handle = nullptr;
+    spmv_alg last_optimized_alg;
+};
+} // namespace oneapi::mkl::sparse
+namespace oneapi::mkl::sparse::cusparse {
+namespace detail {
+// Translate a oneMKL spmv_alg value into the matching cuSPARSE SpMV algorithm enumerator.
+// Every value without a dedicated case maps to CUSPARSE_SPMV_ALG_DEFAULT.
+inline auto get_cuda_spmv_alg(spmv_alg alg) {
+    switch (alg) {
+        case spmv_alg::coo_alg1: return CUSPARSE_SPMV_COO_ALG1;
+        case spmv_alg::coo_alg2: return CUSPARSE_SPMV_COO_ALG2;
+        case spmv_alg::csr_alg1: return CUSPARSE_SPMV_CSR_ALG1;
+        case spmv_alg::csr_alg2: return CUSPARSE_SPMV_CSR_ALG2;
+        default: return CUSPARSE_SPMV_ALG_DEFAULT;
+    }
+}
+// Validate the arguments of an spmv entry point for this backend.
+//
+// Runs the shared checks (check_valid_spmv_common, check_valid_matrix_properties) and then
+// throws mkl::unimplemented when A_view.type_view is anything other than
+// matrix_descr::general. `function_name` is forwarded to the checks and to the exception.
+void check_valid_spmv(const std::string& function_name, oneapi::mkl::transpose opA,
+                      matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                      dense_vector_handle_t y_handle, bool is_alpha_host_accessible,
+                      bool is_beta_host_accessible) {
+    check_valid_spmv_common(function_name, opA, A_view, A_handle, x_handle, y_handle,
+                            is_alpha_host_accessible, is_beta_host_accessible);
+    check_valid_matrix_properties(function_name, A_handle);
+    if (A_view.type_view != matrix_descr::general) {
+        throw mkl::unimplemented(
+            "sparse_blas", function_name,
+            "The backend does not support spmv with a `type_view` other than `matrix_descr::general`.");
+    }
+}
+// Work shared by the buffer and USM overloads of spmv_optimize.
+//
+// Validates the arguments with check_valid_spmv, throws mkl::uninitialized if
+// spmv_buffer_size has not been called on the descriptor, then marks the descriptor as
+// optimized and records the arguments so that spmv can compare against them later.
+inline void common_spmv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible,
+                                 matrix_view A_view, matrix_handle_t A_handle,
+                                 dense_vector_handle_t x_handle, bool is_beta_host_accessible,
+                                 dense_vector_handle_t y_handle, spmv_alg alg,
+                                 spmv_descr_t spmv_descr) {
+    check_valid_spmv("spmv_optimize", opA, A_view, A_handle, x_handle, y_handle,
+                     is_alpha_host_accessible, is_beta_host_accessible);
+    if (!spmv_descr->buffer_size_called) {
+        throw mkl::uninitialized("sparse_blas", "spmv_optimize",
+                                 "spmv_buffer_size must be called before spmv_optimize.");
+    }
+    spmv_descr->optimized_called = true;
+    spmv_descr->last_optimized_opA = opA;
+    spmv_descr->last_optimized_A_view = A_view;
+    spmv_descr->last_optimized_A_handle = A_handle;
+    spmv_descr->last_optimized_x_handle = x_handle;
+    spmv_descr->last_optimized_y_handle = y_handle;
+    spmv_descr->last_optimized_alg = alg;
+}
+#if CUSPARSE_VERSION >= 12300
+// cusparseSpMV_preprocess was added in cuSPARSE 12.3.0 (CUDA 12.4), hence the version guard:
+// this helper only exists when building against cuSPARSE 12.3.0 or newer.
+//
+// Calls cusparseSpMV_preprocess on `cu_handle` with the backend handles of A, x and y and the
+// given workspace pointer. The pointer mode of the handle is set from
+// `is_alpha_host_accessible` first. The returned status is passed to check_status.
+void spmv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void* alpha,
+                        matrix_handle_t A_handle, dense_vector_handle_t x_handle, const void* beta,
+                        dense_vector_handle_t y_handle, spmv_alg alg, void* workspace_ptr,
+                        bool is_alpha_host_accessible) {
+    auto cu_a = A_handle->backend_handle;
+    auto cu_x = x_handle->backend_handle;
+    auto cu_y = y_handle->backend_handle;
+    auto type = A_handle->value_container.data_type;
+    auto cu_op = get_cuda_operation(type, opA);
+    auto cu_type = get_cuda_value_type(type);
+    auto cu_alg = get_cuda_spmv_alg(alg);
+    set_pointer_mode(cu_handle, is_alpha_host_accessible);
+    auto status = cusparseSpMV_preprocess(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, cu_type,
+                                          cu_alg, workspace_ptr);
+    check_status(status, "spmv_optimize");
+}
+#endif
+} // namespace detail
+// Allocate a value-initialized spmv_descr on the heap and store its address in *p_spmv_descr.
+// The queue argument is unused. The descriptor is destroyed by release_spmv_descr.
+void init_spmv_descr(sycl::queue& /*queue*/, spmv_descr_t* p_spmv_descr) {
+    *p_spmv_descr = new spmv_descr();
+}
+// Schedule the destruction of `spmv_descr` and return the event of the scheduled work.
+//
+// A null descriptor is a no-op: the result of collapse_dependencies(queue, dependencies) is
+// returned. Otherwise the descriptor is deleted by one of two submissions:
+//  - buffer path: taken when spmv_optimize recorded non-null A/x/y handles, A uses buffers and
+//    the workspace is a buffer. `dependencies` is not forwarded on this path.
+//  - host_task path: taken otherwise; the task depends on `dependencies`.
+// The descriptor must not be used by the caller after this function has been called.
+sycl::event release_spmv_descr(sycl::queue& queue, spmv_descr_t spmv_descr,
+                               const std::vector<sycl::event>& dependencies) {
+    if (!spmv_descr) {
+        return detail::collapse_dependencies(queue, dependencies);
+    }
+    // Captures the descriptor pointer by value; clears the cached handles before deleting it.
+    auto release_functor = [=]() {
+        spmv_descr->cu_handle = nullptr;
+        spmv_descr->last_optimized_A_handle = nullptr;
+        spmv_descr->last_optimized_x_handle = nullptr;
+        spmv_descr->last_optimized_y_handle = nullptr;
+        delete spmv_descr;
+    };
+    // Use dispatch_submit to ensure the descriptor is kept alive as long as the buffers are used
+    // dispatch_submit can only be used if the descriptor's handles are valid
+    if (spmv_descr->last_optimized_A_handle &&
+        spmv_descr->last_optimized_A_handle->all_use_buffer() &&
+        spmv_descr->last_optimized_x_handle && spmv_descr->last_optimized_y_handle &&
+        spmv_descr->workspace.use_buffer()) {
+        auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor<std::uint8_t>) {
+            release_functor();
+        };
+        return detail::dispatch_submit(
+            __func__, queue, dispatch_functor, spmv_descr->last_optimized_A_handle,
+            spmv_descr->workspace.get_buffer<std::uint8_t>(), spmv_descr->last_optimized_x_handle,
+            spmv_descr->last_optimized_y_handle);
+    }
+    // Release used if USM is used or if the descriptor has been released before spmv_optimize has succeeded
+    sycl::event event = queue.submit([&](sycl::handler& cgh) {
+        cgh.depends_on(dependencies);
+        cgh.host_task(release_functor);
+    });
+    return event;
+}
+// Query the workspace size that cuSPARSE requires for the described SpMV operation.
+//
+// Validates the arguments with check_valid_spmv, then submits a task that caches the cuSPARSE
+// handle and CUDA stream in `spmv_descr` and calls cusparseSpMV_bufferSize. The function
+// blocks until that task has completed (wait_and_throw), so `temp_buffer_size` is set on return.
+// Afterwards the size is also stored in the descriptor and buffer_size_called is set to true.
+void spmv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
+                      matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                      const void* beta, dense_vector_handle_t y_handle, spmv_alg alg,
+                      spmv_descr_t spmv_descr, std::size_t& temp_buffer_size) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    detail::check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle,
+                             is_alpha_host_accessible, is_beta_host_accessible);
+    // temp_buffer_size is captured by reference: this is safe only because the event is waited
+    // on below, before this function returns.
+    auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) {
+        detail::CusparseScopedContextHandler sc(queue, ih);
+        auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue);
+        // Cached so that the optimize and compute steps reuse the same handle and stream.
+        spmv_descr->cu_handle = cu_handle;
+        spmv_descr->cu_stream = cu_stream;
+        auto cu_a = A_handle->backend_handle;
+        auto cu_x = x_handle->backend_handle;
+        auto cu_y = y_handle->backend_handle;
+        auto type = A_handle->value_container.data_type;
+        auto cu_op = detail::get_cuda_operation(type, opA);
+        auto cu_type = detail::get_cuda_value_type(type);
+        auto cu_alg = detail::get_cuda_spmv_alg(alg);
+        detail::set_pointer_mode(cu_handle, is_alpha_host_accessible);
+        auto status = cusparseSpMV_bufferSize(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y,
+                                              cu_type, cu_alg, &temp_buffer_size);
+        detail::check_status(status, __func__);
+    };
+    auto event = detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle);
+    event.wait_and_throw();
+    spmv_descr->temp_buffer_size = temp_buffer_size;
+    spmv_descr->buffer_size_called = true;
+}
+// Buffer overload of spmv_optimize.
+//
+// Throws through throw_incompatible_container if A_handle does not use buffers. Runs
+// common_spmv_optimize, then stores a copy of `workspace` in the descriptor. Nothing more is
+// done when `alg` is no_optimize_alg, or when building against a cuSPARSE older than 12.3.0
+// (no cusparseSpMV_preprocess). Otherwise a task calling spmv_optimize_impl is submitted; the
+// event of that submission is not returned to the caller.
+void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
+                   matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                   const void* beta, dense_vector_handle_t y_handle, spmv_alg alg,
+                   spmv_descr_t spmv_descr, sycl::buffer<std::uint8_t, 1> workspace) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    if (!A_handle->all_use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+    detail::common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle,
+                                 is_beta_host_accessible, y_handle, alg, spmv_descr);
+    // Copy the buffer to extend its lifetime until the descriptor is free'd.
+    spmv_descr->workspace.set_buffer_untyped(workspace);
+    if (alg == spmv_alg::no_optimize_alg) {
+        return;
+    }
+#if CUSPARSE_VERSION < 12300
+    // cusparseSpMV_preprocess was added in cuSPARSE 12.3.0 (CUDA 12.4)
+    return;
+#else
+    if (spmv_descr->temp_buffer_size > 0) {
+        auto functor = [=](sycl::interop_handle ih, sycl::accessor<std::uint8_t> workspace_acc) {
+            auto cu_handle = spmv_descr->cu_handle;
+            auto workspace_ptr = detail::get_mem(ih, workspace_acc);
+            detail::spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle,
+                                       alg, workspace_ptr, is_alpha_host_accessible);
+        };
+        // The accessor can only be created if the buffer size is greater than 0
+        detail::dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle);
+    }
+    else {
+        // No workspace is required: preprocess with a null workspace pointer.
+        auto functor = [=](sycl::interop_handle) {
+            auto cu_handle = spmv_descr->cu_handle;
+            detail::spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle,
+                                       alg, nullptr, is_alpha_host_accessible);
+        };
+        detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle);
+    }
+#endif
+}
+// USM overload of spmv_optimize.
+//
+// Throws through throw_incompatible_container if A_handle uses buffers. Runs
+// common_spmv_optimize, then records the USM `workspace` pointer in the descriptor. When `alg`
+// is no_optimize_alg, or when building against a cuSPARSE older than 12.3.0 (no
+// cusparseSpMV_preprocess), the result of collapse_dependencies(queue, dependencies) is
+// returned. Otherwise returns the event of the submitted task that calls spmv_optimize_impl;
+// that task depends on `dependencies`.
+sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
+                          matrix_view A_view, matrix_handle_t A_handle,
+                          dense_vector_handle_t x_handle, const void* beta,
+                          dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr,
+                          void* workspace, const std::vector<sycl::event>& dependencies) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    if (A_handle->all_use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+    detail::common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle,
+                                 is_beta_host_accessible, y_handle, alg, spmv_descr);
+    spmv_descr->workspace.usm_ptr = workspace;
+    if (alg == spmv_alg::no_optimize_alg) {
+        return detail::collapse_dependencies(queue, dependencies);
+    }
+#if CUSPARSE_VERSION < 12300
+    // cusparseSpMV_preprocess was added in cuSPARSE 12.3.0 (CUDA 12.4)
+    return detail::collapse_dependencies(queue, dependencies);
+#else
+    auto functor = [=](sycl::interop_handle) {
+        // Reuse the handle cached in the descriptor by spmv_buffer_size.
+        auto cu_handle = spmv_descr->cu_handle;
+        detail::spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg,
+                                   workspace, is_alpha_host_accessible);
+    };
+    return detail::dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle,
+                                   y_handle);
+#endif
+}
+// Submit the SpMV computation (cusparseSpMV) and return the event of the submitted task.
+//
+// Before submitting, the function:
+//  - validates the arguments with check_valid_spmv;
+//  - throws through throw_incompatible_container if A_handle and the descriptor's workspace
+//    disagree on buffer vs USM usage;
+//  - throws mkl::uninitialized if spmv_optimize has not been called on the descriptor;
+//  - runs CHECK_DESCR_MATCH (macro defined elsewhere) on each argument recorded by
+//    spmv_optimize - presumably rejecting arguments that differ; confirm against the macro.
+// `dependencies` is only forwarded on the path that does not create a workspace accessor.
+sycl::event spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
+                 matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                 const void* beta, dense_vector_handle_t y_handle, spmv_alg alg,
+                 spmv_descr_t spmv_descr, const std::vector<sycl::event>& dependencies) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    detail::check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle,
+                             is_alpha_host_accessible, is_beta_host_accessible);
+    if (A_handle->all_use_buffer() != spmv_descr->workspace.use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+    if (!spmv_descr->optimized_called) {
+        throw mkl::uninitialized("sparse_blas", __func__,
+                                 "spmv_optimize must be called before spmv.");
+    }
+    CHECK_DESCR_MATCH(spmv_descr, opA, "spmv_optimize");
+    CHECK_DESCR_MATCH(spmv_descr, A_view, "spmv_optimize");
+    CHECK_DESCR_MATCH(spmv_descr, A_handle, "spmv_optimize");
+    CHECK_DESCR_MATCH(spmv_descr, x_handle, "spmv_optimize");
+    CHECK_DESCR_MATCH(spmv_descr, y_handle, "spmv_optimize");
+    CHECK_DESCR_MATCH(spmv_descr, alg, "spmv_optimize");
+    bool is_in_order_queue = queue.is_in_order();
+    // Shared by the buffer and USM paths; only the way workspace_ptr is obtained differs.
+    auto compute_functor = [=](void* workspace_ptr) {
+        // Reuse the handle and stream cached in the descriptor by spmv_buffer_size.
+        auto cu_handle = spmv_descr->cu_handle;
+        auto cu_a = A_handle->backend_handle;
+        auto cu_x = x_handle->backend_handle;
+        auto cu_y = y_handle->backend_handle;
+        auto type = A_handle->value_container.data_type;
+        auto cu_op = detail::get_cuda_operation(type, opA);
+        auto cu_type = detail::get_cuda_value_type(type);
+        auto cu_alg = detail::get_cuda_spmv_alg(alg);
+        detail::set_pointer_mode(cu_handle, is_alpha_host_accessible);
+        auto status = cusparseSpMV(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, cu_type, cu_alg,
+                                   workspace_ptr);
+        detail::check_status(status, __func__);
+        detail::synchronize_if_needed(is_in_order_queue, spmv_descr->cu_stream);
+    };
+    if (A_handle->all_use_buffer() && spmv_descr->temp_buffer_size > 0) {
+        // The accessor can only be created if the buffer size is greater than 0
+        auto functor_buffer = [=](sycl::interop_handle ih,
+                                  sycl::accessor<std::uint8_t> workspace_acc) {
+            auto workspace_ptr = detail::get_mem(ih, workspace_acc);
+            compute_functor(workspace_ptr);
+        };
+        return detail::dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle,
+                                                  spmv_descr->workspace.get_buffer<std::uint8_t>(),
+                                                  x_handle, y_handle);
+    }
+    else {
+        // The same dispatch_submit can be used for USM or buffers if no
+        // workspace accessor is needed, workspace_ptr will be a nullptr in the
+        // latter case.
+        auto workspace_ptr = spmv_descr->workspace.usm_ptr;
+        auto functor_usm = [=](sycl::interop_handle) {
+            compute_functor(workspace_ptr);
+        };
+        return detail::dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm,
+                                                  A_handle, x_handle, y_handle);
+    }
+}
+} // namespace oneapi::mkl::sparse::cusparse
diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp
new file mode 100644
index 000000000..5c49df013
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp
@@ -0,0 +1,289 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_error.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_helper.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_task.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_handles.hpp"
+#include "sparse_blas/common_op_verification.hpp"
+#include "sparse_blas/macros.hpp"
+#include "sparse_blas/matrix_view_comparison.hpp"
+#include "sparse_blas/sycl_helper.hpp"
+namespace oneapi::mkl::sparse {
+// Complete the definition of the incomplete type spsv_descr, i.e. the object that spsv_descr_t points to.
+struct spsv_descr {
+    // Cache the CUstream and global handle to avoid relying on CusparseScopedContextHandler to retrieve them.
+    // cuSPARSE seems to implicitly require using the same CUstream for a whole operation (buffer_size, optimization and computation steps).
+    // This is needed as the default SYCL queue is out-of-order, which can use a different CUstream for each host_task or native_command.
+    CUstream cu_stream; // Set by spsv_buffer_size, read by spsv
+    cusparseHandle_t cu_handle; // Set by spsv_buffer_size, reused by spsv_optimize and spsv
+    cusparseSpSVDescr_t cu_descr; // Created by init_spsv_descr, destroyed by release_spsv_descr
+    detail::generic_container workspace; // Holds the workspace buffer given to the buffer overload of spsv_optimize
+    bool buffer_size_called = false; // Set by spsv_buffer_size, required by spsv_optimize
+    bool optimized_called = false; // Set by spsv_optimize, required by spsv
+    oneapi::mkl::transpose last_optimized_opA; // The last_optimized_* members record the arguments of the latest spsv_optimize call
+    matrix_view last_optimized_A_view;
+    matrix_handle_t last_optimized_A_handle; // The three handles below are also null-checked by release_spsv_descr
+    dense_vector_handle_t last_optimized_x_handle;
+    dense_vector_handle_t last_optimized_y_handle;
+    spsv_alg last_optimized_alg;
+} // namespace oneapi::mkl::sparse
+namespace oneapi::mkl::sparse::cusparse {
+namespace detail {
+inline auto get_cuda_spsv_alg(spsv_alg /*alg*/) {
+void check_valid_spsv(const std::string& function_name, matrix_view A_view,
+                      matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                      dense_vector_handle_t y_handle, bool is_alpha_host_accessible) {
+    check_valid_spsv_common(function_name, A_view, A_handle, x_handle, y_handle,
+                            is_alpha_host_accessible); // Argument checks for spsv; presumably the backend-independent ones from common_op_verification.hpp
+    check_valid_matrix_properties(function_name, A_handle); // Extra check on the matrix handle only; NOTE(review): helper not visible here, confirm what it rejects
+inline void common_spsv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible,
+                                 matrix_view A_view, matrix_handle_t A_handle,
+                                 dense_vector_handle_t x_handle, dense_vector_handle_t y_handle,
+                                 spsv_alg alg, spsv_descr_t spsv_descr) {
+    check_valid_spsv("spsv_optimize", A_view, A_handle, x_handle, y_handle,
+                     is_alpha_host_accessible);
+    if (!spsv_descr->buffer_size_called) { // spsv_buffer_size is the step that caches cu_handle and cu_stream in the descriptor
+        throw mkl::uninitialized("sparse_blas", "spsv_optimize",
+                                 "spsv_buffer_size must be called before spsv_optimize.");
+    }
+    spsv_descr->optimized_called = true; // Set here, i.e. before the callers submit the analysis itself
+    spsv_descr->last_optimized_opA = opA; // Record the arguments so that spsv can check its own against them (CHECK_DESCR_MATCH)
+    spsv_descr->last_optimized_A_view = A_view;
+    spsv_descr->last_optimized_A_handle = A_handle;
+    spsv_descr->last_optimized_x_handle = x_handle;
+    spsv_descr->last_optimized_y_handle = y_handle;
+    spsv_descr->last_optimized_alg = alg;
+void spsv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void* alpha,
+                        matrix_view A_view, matrix_handle_t A_handle,
+                        dense_vector_handle_t x_handle, dense_vector_handle_t y_handle,
+                        spsv_alg alg, spsv_descr_t spsv_descr, void* workspace_ptr,
+                        bool is_alpha_host_accessible) {
+    auto cu_a = A_handle->backend_handle; // Native cuSPARSE descriptors wrapped by the handles
+    auto cu_x = x_handle->backend_handle;
+    auto cu_y = y_handle->backend_handle;
+    auto type = A_handle->value_container.data_type;
+    set_matrix_attributes("spsv_optimize", cu_a, A_view); // Presumably applies A_view to the native matrix descriptor - helper not visible here
+    auto cu_op = get_cuda_operation(type, opA);
+    auto cu_type = get_cuda_value_type(type);
+    auto cu_alg = get_cuda_spsv_alg(alg);
+    auto cu_descr = spsv_descr->cu_descr;
+    set_pointer_mode(cu_handle, is_alpha_host_accessible); // alpha may be a host or a device pointer
+    auto status = cusparseSpSV_analysis(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg,
+                                        cu_descr, workspace_ptr); // workspace_ptr may be nullptr when the caller has no workspace
+    check_status(status, "spsv_optimize");
+} // namespace detail
+void init_spsv_descr(sycl::queue& /*queue*/, spsv_descr_t* p_spsv_descr) {
+    *p_spsv_descr = new spsv_descr(); // Value-initialization: members without a default initializer start zeroed, which the null checks in release_spsv_descr rely on
+    CUSPARSE_ERR_FUNC(cusparseSpSV_createDescr, &(*p_spsv_descr)->cu_descr); // Paired with cusparseSpSV_destroyDescr in release_spsv_descr
+sycl::event release_spsv_descr(sycl::queue& queue, spsv_descr_t spsv_descr,
+                               const std::vector<sycl::event>& dependencies) {
+    if (!spsv_descr) { // Nothing to release for a null descriptor
+        return detail::collapse_dependencies(queue, dependencies);
+    }
+    auto release_functor = [=]() { // Destroys the cuSPARSE descriptor, then frees the spsv_descr object itself
+        CUSPARSE_ERR_FUNC(cusparseSpSV_destroyDescr, spsv_descr->cu_descr);
+        spsv_descr->cu_handle = nullptr;
+        spsv_descr->cu_descr = nullptr;
+        spsv_descr->last_optimized_A_handle = nullptr;
+        spsv_descr->last_optimized_x_handle = nullptr;
+        spsv_descr->last_optimized_y_handle = nullptr;
+        delete spsv_descr;
+    };
+    // Use dispatch_submit to ensure the descriptor is kept alive for as long as the buffers are in use.
+    // dispatch_submit can only be used if the descriptor's handles are valid, hence the null checks in the condition.
+    if (spsv_descr->last_optimized_A_handle &&
+        spsv_descr->last_optimized_A_handle->all_use_buffer() &&
+        spsv_descr->last_optimized_x_handle && spsv_descr->last_optimized_y_handle &&
+        spsv_descr->workspace.use_buffer()) {
+        auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor<std::uint8_t>) {
+            release_functor();
+        };
+        return detail::dispatch_submit(
+            __func__, queue, dispatch_functor, spsv_descr->last_optimized_A_handle,
+            spsv_descr->workspace.get_buffer<std::uint8_t>(), spsv_descr->last_optimized_x_handle,
+            spsv_descr->last_optimized_y_handle); // NOTE(review): `dependencies` is not forwarded here and the workspace size is not checked before requesting an accessor - confirm both are intended
+    }
+    // Fallback path: used for USM, or when the descriptor is released before spsv_optimize has succeeded.
+    sycl::event event = queue.submit([&](sycl::handler& cgh) {
+        cgh.depends_on(dependencies);
+        cgh.host_task(release_functor);
+    });
+    return event;
+void spsv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
+                      matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                      dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr,
+                      std::size_t& temp_buffer_size) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    detail::check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle,
+                             is_alpha_host_accessible);
+    auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) { // temp_buffer_size is captured by reference: the event is waited on before this function returns
+        detail::CusparseScopedContextHandler sc(queue, ih);
+        auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue);
+        spsv_descr->cu_handle = cu_handle; // Cached in the descriptor: spsv_optimize and spsv reuse them instead of querying the queue again
+        spsv_descr->cu_stream = cu_stream;
+        auto cu_a = A_handle->backend_handle;
+        auto cu_x = x_handle->backend_handle;
+        auto cu_y = y_handle->backend_handle;
+        auto type = A_handle->value_container.data_type;
+        detail::set_matrix_attributes(__func__, cu_a, A_view);
+        auto cu_op = detail::get_cuda_operation(type, opA);
+        auto cu_type = detail::get_cuda_value_type(type);
+        auto cu_alg = detail::get_cuda_spsv_alg(alg);
+        auto cu_descr = spsv_descr->cu_descr;
+        detail::set_pointer_mode(cu_handle, is_alpha_host_accessible); // alpha may be a host or a device pointer
+        auto status = cusparseSpSV_bufferSize(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type,
+                                              cu_alg, cu_descr, &temp_buffer_size); // Writes the required workspace size, in bytes, into the caller's variable
+        detail::check_status(status, __func__);
+    };
+    auto event = detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle);
+    event.wait_and_throw(); // Blocking: temp_buffer_size must hold the result when this function returns
+    spsv_descr->buffer_size_called = true; // Not reached if the wait above throws
+void spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
+                   matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                   dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr,
+                   sycl::buffer<std::uint8_t, 1> workspace) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    if (!A_handle->all_use_buffer()) { // This overload takes a buffer workspace, so the matrix handle must use buffers too
+        detail::throw_incompatible_container(__func__);
+    }
+    detail::common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle,
+                                 y_handle, alg, spsv_descr);
+    // spsv_alg::no_optimize_alg is ignored: the analysis step is mandatory for cuSPARSE.
+    // Copy the buffer to extend its lifetime until the descriptor is freed.
+    spsv_descr->workspace.set_buffer_untyped(workspace);
+    if (workspace.size() > 0) {
+        auto functor = [=](sycl::interop_handle ih, sycl::accessor<std::uint8_t> workspace_acc) {
+            auto cu_handle = spsv_descr->cu_handle; // Handle cached by spsv_buffer_size
+            auto workspace_ptr = detail::get_mem(ih, workspace_acc);
+            detail::spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle,
+                                       alg, spsv_descr, workspace_ptr, is_alpha_host_accessible);
+        };
+        // The accessor can only be created if the buffer size is greater than 0
+        detail::dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle); // Returned event is discarded: this overload returns void
+    }
+    else {
+        auto functor = [=](sycl::interop_handle) {
+            auto cu_handle = spsv_descr->cu_handle; // Handle cached by spsv_buffer_size
+            detail::spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle,
+                                       alg, spsv_descr, nullptr, is_alpha_host_accessible); // Empty workspace: run the analysis with a null workspace pointer
+        };
+        detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle);
+    }
+sycl::event spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
+                          matrix_view A_view, matrix_handle_t A_handle,
+                          dense_vector_handle_t x_handle, dense_vector_handle_t y_handle,
+                          spsv_alg alg, spsv_descr_t spsv_descr, void* workspace,
+                          const std::vector<sycl::event>& dependencies) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    if (A_handle->all_use_buffer()) { // This overload takes a raw workspace pointer, so buffer-based handles are rejected
+        detail::throw_incompatible_container(__func__);
+    }
+    detail::common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle,
+                                 y_handle, alg, spsv_descr);
+    // spsv_alg::no_optimize_alg is ignored: the analysis step is mandatory for cuSPARSE.
+    auto functor = [=](sycl::interop_handle) {
+        auto cu_handle = spsv_descr->cu_handle; // Handle cached by spsv_buffer_size
+        detail::spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg,
+                                   spsv_descr, workspace, is_alpha_host_accessible);
+    };
+    // No need to store the workspace USM pointer in the descriptor as the backend stores it already.
+    return detail::dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle,
+                                   y_handle);
+sycl::event spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
+                 matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                 dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr,
+                 const std::vector<sycl::event>& dependencies) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    detail::check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle,
+                             is_alpha_host_accessible);
+    if (A_handle->all_use_buffer() != spsv_descr->workspace.use_buffer()) { // NOTE(review): evaluated before the optimized_called check below, so a never-optimized descriptor may be reported as an incompatible container - confirm
+        detail::throw_incompatible_container(__func__);
+    }
+    if (!spsv_descr->optimized_called) {
+        throw mkl::uninitialized("sparse_blas", __func__,
+                                 "spsv_optimize must be called before spsv.");
+    }
+    CHECK_DESCR_MATCH(spsv_descr, opA, "spsv_optimize"); // Each argument is checked against the value recorded by the last spsv_optimize call
+    CHECK_DESCR_MATCH(spsv_descr, A_view, "spsv_optimize");
+    CHECK_DESCR_MATCH(spsv_descr, A_handle, "spsv_optimize");
+    CHECK_DESCR_MATCH(spsv_descr, x_handle, "spsv_optimize");
+    CHECK_DESCR_MATCH(spsv_descr, y_handle, "spsv_optimize");
+    CHECK_DESCR_MATCH(spsv_descr, alg, "spsv_optimize");
+    bool is_in_order_queue = queue.is_in_order();
+    auto functor = [=](sycl::interop_handle) {
+        auto cu_handle = spsv_descr->cu_handle; // Handle cached by spsv_buffer_size
+        auto cu_a = A_handle->backend_handle;
+        auto cu_x = x_handle->backend_handle;
+        auto cu_y = y_handle->backend_handle;
+        auto type = A_handle->value_container.data_type;
+        detail::set_matrix_attributes(__func__, cu_a, A_view);
+        auto cu_op = detail::get_cuda_operation(type, opA);
+        auto cu_type = detail::get_cuda_value_type(type);
+        auto cu_alg = detail::get_cuda_spsv_alg(alg);
+        auto cu_descr = spsv_descr->cu_descr;
+        detail::set_pointer_mode(cu_handle, is_alpha_host_accessible); // alpha may be a host or a device pointer
+        auto status = cusparseSpSV_solve(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg,
+                                         cu_descr); // No workspace argument: only the descriptor prepared by the analysis step is passed
+        detail::check_status(status, __func__);
+        detail::synchronize_if_needed(is_in_order_queue, spsv_descr->cu_stream); // Uses the stream cached by spsv_buffer_size
+    };
+    return detail::dispatch_submit_native_ext(__func__, queue, dependencies, functor, A_handle,
+                                              x_handle, y_handle);
+} // namespace oneapi::mkl::sparse::cusparse
diff --git a/src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp b/src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp
new file mode 100644
index 000000000..28c628438
--- /dev/null
+++ b/src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp
@@ -0,0 +1,37 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+/// Convert \p value_type to a template type argument and use it to call \p op_functor with the remaining arguments. Every supported case returns the functor's result from the enclosing function; any other \p value_type throws oneapi::mkl::exception tagged with \p function_name.
+#define DISPATCH_MKL_OPERATION(function_name, value_type, op_functor, ...)                         \
+    switch (value_type) {                                                                          \
+        case detail::data_type::real_fp32: return op_functor<float>(__VA_ARGS__);                  \
+        case detail::data_type::real_fp64: return op_functor<double>(__VA_ARGS__);                 \
+        case detail::data_type::complex_fp32: return op_functor<std::complex<float>>(__VA_ARGS__); \
+        case detail::data_type::complex_fp64:                                                      \
+            return op_functor<std::complex<double>>(__VA_ARGS__);                                  \
+        default:                                                                                   \
+            throw oneapi::mkl::exception(                                                          \
+                "sparse_blas", function_name,                                                      \
+                "Internal error: unsupported type " + data_type_to_str(value_type));               \
+    }
diff --git a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx
index 1e4ab95f1..5fa5ea0a4 100644
--- a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx
+++ b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx
@@ -17,102 +17,64 @@
+// In this file, functions and types explicitly qualified with the namespace oneapi::mkl::sparse:: refer to the backend's namespace; the qualification is kept for better readability.
 // Dense vector
 template <typename fpType>
-void init_dense_vector(sycl::queue& /*queue*/,
-                       oneapi::mkl::sparse::dense_vector_handle_t* p_dvhandle, std::int64_t size,
+void init_dense_vector(sycl::queue& /*queue*/, dense_vector_handle_t* p_dvhandle, std::int64_t size,
                        sycl::buffer<fpType, 1> val) {
-    *p_dvhandle = new oneapi::mkl::sparse::dense_vector_handle(val, size);
+    *p_dvhandle = new dense_vector_handle(val, size);
 template <typename fpType>
-void init_dense_vector(sycl::queue& /*queue*/,
-                       oneapi::mkl::sparse::dense_vector_handle_t* p_dvhandle, std::int64_t size,
+void init_dense_vector(sycl::queue& /*queue*/, dense_vector_handle_t* p_dvhandle, std::int64_t size,
                        fpType* val) {
-    *p_dvhandle = new oneapi::mkl::sparse::dense_vector_handle(val, size);
-template <typename fpType, typename InternalHandleT>
-void check_can_reset_value_handle(const std::string& function_name,
-                                  InternalHandleT* internal_handle, bool expect_buffer) {
-    if (internal_handle->get_value_type() != detail::get_data_type<fpType>()) {
-        throw oneapi::mkl::invalid_argument(
-            "sparse_blas", function_name,
-            "Incompatible data types expected " +
-                data_type_to_str(internal_handle->get_value_type()) + " but got " +
-                data_type_to_str(detail::get_data_type<fpType>()));
-    }
-    if (internal_handle->all_use_buffer() != expect_buffer) {
-        throw oneapi::mkl::invalid_argument(
-            "sparse_blas", function_name, "Cannot change the container type between buffer or USM");
-    }
+    *p_dvhandle = new dense_vector_handle(val, size);
 template <typename fpType>
-void set_dense_vector_data(sycl::queue& /*queue*/,
-                           oneapi::mkl::sparse::dense_vector_handle_t dvhandle, std::int64_t size,
-                           sycl::buffer<fpType, 1> val) {
-    check_can_reset_value_handle<fpType>(__func__, dvhandle, true);
+void set_dense_vector_data(sycl::queue& /*queue*/, dense_vector_handle_t dvhandle,
+                           std::int64_t size, sycl::buffer<fpType, 1> val) {
+    detail::check_can_reset_value_handle<fpType>(__func__, dvhandle, true);
     dvhandle->size = size;
 template <typename fpType>
-void set_dense_vector_data(sycl::queue& /*queue*/,
-                           oneapi::mkl::sparse::dense_vector_handle_t dvhandle, std::int64_t size,
-                           fpType* val) {
-    check_can_reset_value_handle<fpType>(__func__, dvhandle, false);
+void set_dense_vector_data(sycl::queue& /*queue*/, dense_vector_handle_t dvhandle,
+                           std::int64_t size, fpType* val) {
+    detail::check_can_reset_value_handle<fpType>(__func__, dvhandle, false);
     dvhandle->size = size;
-#define INSTANTIATE_DENSE_VECTOR_FUNCS(FP_TYPE, FP_SUFFIX)                            \
-    template void init_dense_vector<FP_TYPE>(                                         \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \
-        std::int64_t size, sycl::buffer<FP_TYPE, 1> val);                             \
-    template void init_dense_vector<FP_TYPE>(                                         \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \
-        std::int64_t size, FP_TYPE* val);                                             \
-    template void set_dense_vector_data<FP_TYPE>(                                     \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle,     \
-        std::int64_t size, sycl::buffer<FP_TYPE, 1> val);                             \
-    template void set_dense_vector_data<FP_TYPE>(                                     \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle,     \
-        std::int64_t size, FP_TYPE* val)
-sycl::event release_dense_vector(sycl::queue& queue,
-                                 oneapi::mkl::sparse::dense_vector_handle_t dvhandle,
+sycl::event release_dense_vector(sycl::queue& queue, dense_vector_handle_t dvhandle,
                                  const std::vector<sycl::event>& dependencies) {
     return detail::submit_release(queue, dvhandle, dependencies);
 // Dense matrix
 template <typename fpType>
-void init_dense_matrix(sycl::queue& /*queue*/,
-                       oneapi::mkl::sparse::dense_matrix_handle_t* p_dmhandle,
+void init_dense_matrix(sycl::queue& /*queue*/, dense_matrix_handle_t* p_dmhandle,
                        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
                        oneapi::mkl::layout dense_layout, sycl::buffer<fpType, 1> val) {
-    *p_dmhandle =
-        new oneapi::mkl::sparse::dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout);
+    *p_dmhandle = new dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout);
 template <typename fpType>
-void init_dense_matrix(sycl::queue& /*queue*/,
-                       oneapi::mkl::sparse::dense_matrix_handle_t* p_dmhandle,
+void init_dense_matrix(sycl::queue& /*queue*/, dense_matrix_handle_t* p_dmhandle,
                        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
                        oneapi::mkl::layout dense_layout, fpType* val) {
-    *p_dmhandle =
-        new oneapi::mkl::sparse::dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout);
+    *p_dmhandle = new dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout);
 template <typename fpType>
-void set_dense_matrix_data(sycl::queue& /*queue*/,
-                           oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,
+void set_dense_matrix_data(sycl::queue& /*queue*/, dense_matrix_handle_t dmhandle,
                            std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
                            oneapi::mkl::layout dense_layout, sycl::buffer<fpType, 1> val) {
-    check_can_reset_value_handle<fpType>(__func__, dmhandle, true);
+    detail::check_can_reset_value_handle<fpType>(__func__, dmhandle, true);
     dmhandle->num_rows = num_rows;
     dmhandle->num_cols = num_cols;
     dmhandle->ld = ld;
@@ -121,11 +83,10 @@ void set_dense_matrix_data(sycl::queue& /*queue*/,
 template <typename fpType>
-void set_dense_matrix_data(sycl::queue& /*queue*/,
-                           oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,
+void set_dense_matrix_data(sycl::queue& /*queue*/, dense_matrix_handle_t dmhandle,
                            std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
                            oneapi::mkl::layout dense_layout, fpType* val) {
-    check_can_reset_value_handle<fpType>(__func__, dmhandle, false);
+    detail::check_can_reset_value_handle<fpType>(__func__, dmhandle, false);
     dmhandle->num_rows = num_rows;
     dmhandle->num_cols = num_cols;
     dmhandle->ld = ld;
@@ -133,28 +94,9 @@ void set_dense_matrix_data(sycl::queue& /*queue*/,
-#define INSTANTIATE_DENSE_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX)                            \
-    template void init_dense_matrix<FP_TYPE>(                                         \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
-        oneapi::mkl::layout dense_layout, sycl::buffer<FP_TYPE, 1> val);              \
-    template void init_dense_matrix<FP_TYPE>(                                         \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
-        oneapi::mkl::layout dense_layout, FP_TYPE* val);                              \
-    template void set_dense_matrix_data<FP_TYPE>(                                     \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,     \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
-        oneapi::mkl::layout dense_layout, sycl::buffer<FP_TYPE, 1> val);              \
-    template void set_dense_matrix_data<FP_TYPE>(                                     \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,     \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
-        oneapi::mkl::layout dense_layout, FP_TYPE* val)
-sycl::event release_dense_matrix(sycl::queue& queue,
-                                 oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,
+sycl::event release_dense_matrix(sycl::queue& queue, dense_matrix_handle_t dmhandle,
                                  const std::vector<sycl::event>& dependencies) {
     return detail::submit_release(queue, dmhandle, dependencies);
@@ -167,7 +109,9 @@ void init_coo_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p
                      sycl::buffer<intType, 1> col_ind, sycl::buffer<fpType, 1> val) {
     oneapi::mkl::sparse::matrix_handle_t mkl_handle;
-    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val);
+    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val,
+                                                              detail::sparse_format::COO, num_rows,
+                                                              num_cols, nnz, index);
     // The backend handle must use the buffers from the internal handle as they will be kept alive until the handle is released.
     oneapi::mkl::sparse::set_coo_data(queue, mkl_handle, static_cast<intType>(num_rows),
                                       static_cast<intType>(num_cols), static_cast<intType>(nnz),
@@ -184,7 +128,9 @@ void init_coo_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p
                      fpType* val) {
     oneapi::mkl::sparse::matrix_handle_t mkl_handle;
-    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val);
+    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val,
+                                                              detail::sparse_format::COO, num_rows,
+                                                              num_cols, nnz, index);
     auto event = oneapi::mkl::sparse::set_coo_data(
         queue, mkl_handle, static_cast<intType>(num_rows), static_cast<intType>(num_cols),
         static_cast<intType>(nnz), index, row_ind, col_ind, val);
@@ -192,32 +138,17 @@ void init_coo_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p
     *p_smhandle = reinterpret_cast<oneapi::mkl::sparse::matrix_handle_t>(internal_smhandle);
-template <typename fpType, typename intType>
-void check_can_reset_sparse_handle(const std::string& function_name,
-                                   detail::sparse_matrix_handle* internal_smhandle,
-                                   bool expect_buffer) {
-    check_can_reset_value_handle<fpType>(function_name, internal_smhandle, expect_buffer);
-    if (internal_smhandle->get_int_type() != detail::get_data_type<intType>()) {
-        throw oneapi::mkl::invalid_argument(
-            "sparse_blas", function_name,
-            "Incompatible data types expected " +
-                data_type_to_str(internal_smhandle->get_int_type()) + " but got " +
-                data_type_to_str(detail::get_data_type<intType>()));
-    }
-    if (!internal_smhandle->can_be_reset) {
-        throw mkl::unimplemented(
-            "sparse_blas/mkl", function_name,
-            "Reseting the matrix handle's data after it was used in a computation is not supported.");
-    }
 template <typename fpType, typename intType>
 void set_coo_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle,
                          std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
                          oneapi::mkl::index_base index, sycl::buffer<intType, 1> row_ind,
                          sycl::buffer<intType, 1> col_ind, sycl::buffer<fpType, 1> val) {
     auto internal_smhandle = detail::get_internal_handle(smhandle);
-    check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, true);
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, true);
+    internal_smhandle->num_rows = num_rows;
+    internal_smhandle->num_cols = num_cols;
+    internal_smhandle->nnz = nnz;
+    internal_smhandle->index = index;
@@ -236,7 +167,11 @@ void set_coo_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_
                          oneapi::mkl::index_base index, intType* row_ind, intType* col_ind,
                          fpType* val) {
     auto internal_smhandle = detail::get_internal_handle(smhandle);
-    check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, false);
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, false);
+    internal_smhandle->num_rows = num_rows;
+    internal_smhandle->num_cols = num_cols;
+    internal_smhandle->nnz = nnz;
+    internal_smhandle->index = index;
@@ -246,37 +181,19 @@ void set_coo_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_
-    template void init_coo_matrix<FP_TYPE, INT_TYPE>(                                              \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
-        oneapi::mkl::index_base index, sycl::buffer<INT_TYPE, 1> row_ind,                          \
-        sycl::buffer<INT_TYPE, 1> col_ind, sycl::buffer<FP_TYPE, 1> val);                          \
-    template void init_coo_matrix<FP_TYPE, INT_TYPE>(                                              \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
-        oneapi::mkl::index_base index, INT_TYPE* row_ind, INT_TYPE* col_ind, FP_TYPE* val);        \
-    template void set_coo_matrix_data<FP_TYPE, INT_TYPE>(                                          \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
-        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,                    \
-        sycl::buffer<INT_TYPE, 1> row_ind, sycl::buffer<INT_TYPE, 1> col_ind,                      \
-        sycl::buffer<FP_TYPE, 1> val);                                                             \
-    template void set_coo_matrix_data<FP_TYPE, INT_TYPE>(                                          \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
-        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, INT_TYPE* row_ind, \
-        INT_TYPE* col_ind, FP_TYPE* val)
 // CSR matrix
 template <typename fpType, typename intType>
 void init_csr_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p_smhandle,
-                     std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/,
+                     std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
                      oneapi::mkl::index_base index, sycl::buffer<intType, 1> row_ptr,
                      sycl::buffer<intType, 1> col_ind, sycl::buffer<fpType, 1> val) {
     oneapi::mkl::sparse::matrix_handle_t mkl_handle;
-    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val);
+    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val,
+                                                              detail::sparse_format::CSR, num_rows,
+                                                              num_cols, nnz, index);
     // The backend deduces nnz from row_ptr.
     // The backend handle must use the buffers from the internal handle as they will be kept alive until the handle is released.
     oneapi::mkl::sparse::set_csr_data(queue, mkl_handle, static_cast<intType>(num_rows),
@@ -289,12 +206,14 @@ void init_csr_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p
 template <typename fpType, typename intType>
 void init_csr_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p_smhandle,
-                     std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/,
+                     std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
                      oneapi::mkl::index_base index, intType* row_ptr, intType* col_ind,
                      fpType* val) {
     oneapi::mkl::sparse::matrix_handle_t mkl_handle;
-    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val);
+    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val,
+                                                              detail::sparse_format::CSR, num_rows,
+                                                              num_cols, nnz, index);
     // The backend deduces nnz from row_ptr.
     auto event = oneapi::mkl::sparse::set_csr_data(
         queue, mkl_handle, static_cast<intType>(num_rows), static_cast<intType>(num_cols), index,
@@ -305,11 +224,15 @@ void init_csr_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p
 template <typename fpType, typename intType>
 void set_csr_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle,
-                         std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/,
+                         std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
                          oneapi::mkl::index_base index, sycl::buffer<intType, 1> row_ptr,
                          sycl::buffer<intType, 1> col_ind, sycl::buffer<fpType, 1> val) {
     auto internal_smhandle = detail::get_internal_handle(smhandle);
-    check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, true);
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, true);
+    internal_smhandle->num_rows = num_rows;
+    internal_smhandle->num_cols = num_cols;
+    internal_smhandle->nnz = nnz;
+    internal_smhandle->index = index;
@@ -325,11 +248,15 @@ void set_csr_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_
 template <typename fpType, typename intType>
 void set_csr_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle,
-                         std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/,
+                         std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
                          oneapi::mkl::index_base index, intType* row_ptr, intType* col_ind,
                          fpType* val) {
     auto internal_smhandle = detail::get_internal_handle(smhandle);
-    check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, false);
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, false);
+    internal_smhandle->num_rows = num_rows;
+    internal_smhandle->num_cols = num_cols;
+    internal_smhandle->nnz = nnz;
+    internal_smhandle->index = index;
@@ -340,27 +267,7 @@ void set_csr_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_
-    template void init_csr_matrix<FP_TYPE, INT_TYPE>(                                              \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
-        oneapi::mkl::index_base index, sycl::buffer<INT_TYPE, 1> row_ptr,                          \
-        sycl::buffer<INT_TYPE, 1> col_ind, sycl::buffer<FP_TYPE, 1> val);                          \
-    template void init_csr_matrix<FP_TYPE, INT_TYPE>(                                              \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
-        oneapi::mkl::index_base index, INT_TYPE* row_ptr, INT_TYPE* col_ind, FP_TYPE* val);        \
-    template void set_csr_matrix_data<FP_TYPE, INT_TYPE>(                                          \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
-        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,                    \
-        sycl::buffer<INT_TYPE, 1> row_ptr, sycl::buffer<INT_TYPE, 1> col_ind,                      \
-        sycl::buffer<FP_TYPE, 1> val);                                                             \
-    template void set_csr_matrix_data<FP_TYPE, INT_TYPE>(                                          \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
-        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, INT_TYPE* row_ptr, \
-        INT_TYPE* col_ind, FP_TYPE* val)
 // Common sparse matrix functions
 sycl::event release_sparse_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle,
@@ -369,22 +276,22 @@ sycl::event release_sparse_matrix(sycl::queue& queue, oneapi::mkl::sparse::matri
     // Asynchronously release the backend's handle followed by the internal handle.
     auto event = oneapi::mkl::sparse::release_matrix_handle(
         queue, &internal_smhandle->backend_handle, dependencies);
-    return detail::submit_release(queue, internal_smhandle, event);
+    return detail::submit_release(queue, internal_smhandle, { event });
 bool set_matrix_property(sycl::queue& /*queue*/, oneapi::mkl::sparse::matrix_handle_t smhandle,
-                         oneapi::mkl::sparse::matrix_property property) {
+                         matrix_property property) {
     auto internal_smhandle = detail::get_internal_handle(smhandle);
     // Store the matrix property internally for better error checking
     // Set the matrix property on the backend handle
     // Backend and oneMKL interface types for the property don't match
     switch (property) {
-        case oneapi::mkl::sparse::matrix_property::symmetric:
+        case matrix_property::symmetric:
             return true;
-        case oneapi::mkl::sparse::matrix_property::sorted:
+        case matrix_property::sorted:
             return true;
diff --git a/src/sparse_blas/backends/mkl_common/mkl_handles.hpp b/src/sparse_blas/backends/mkl_common/mkl_handles.hpp
index 44b12e8df..1bce0b8fb 100644
--- a/src/sparse_blas/backends/mkl_common/mkl_handles.hpp
+++ b/src/sparse_blas/backends/mkl_common/mkl_handles.hpp
@@ -26,6 +26,8 @@
 #include <oneapi/mkl/spblas.hpp>
 #include "sparse_blas/generic_container.hpp"
+#include "sparse_blas/macros.hpp"
+#include "sparse_blas/sycl_helper.hpp"
 namespace oneapi::mkl::sparse {
diff --git a/src/sparse_blas/backends/mkl_common/mkl_helper.hpp b/src/sparse_blas/backends/mkl_common/mkl_helper.hpp
deleted file mode 100644
index 99dc6707d..000000000
--- a/src/sparse_blas/backends/mkl_common/mkl_helper.hpp
+++ /dev/null
@@ -1,111 +0,0 @@
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#include <CL/sycl.hpp>
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/sparse_blas/detail/helper_types.hpp"
-#include "sparse_blas/enum_data_types.hpp"
-#include "sparse_blas/macros.hpp"
-namespace oneapi::mkl::sparse::detail {
-/// Return whether a pointer is accessible on the host
-template <typename T>
-inline bool is_ptr_accessible_on_host(sycl::queue& queue, const T* host_or_device_ptr) {
-    auto alloc_type = sycl::get_pointer_type(host_or_device_ptr, queue.get_context());
-    return alloc_type == sycl::usm::alloc::host || alloc_type == sycl::usm::alloc::shared ||
-           alloc_type == sycl::usm::alloc::unknown;
-/// Throw an exception if the scalar is not accessible in the host
-inline void check_ptr_is_host_accessible(const std::string& function_name,
-                                         const std::string& scalar_name,
-                                         bool is_ptr_accessible_on_host) {
-    if (!is_ptr_accessible_on_host) {
-        throw mkl::invalid_argument(
-            "sparse_blas", function_name,
-            "Scalar " + scalar_name + " must be accessible on the host for buffer functions.");
-    }
-/// Return a scalar on the host from a pointer to host or device memory
-/// Used for USM functions
-template <typename T>
-inline T get_scalar_on_host(sycl::queue& queue, const T* host_or_device_ptr,
-                            bool is_ptr_accessible_on_host) {
-    if (is_ptr_accessible_on_host) {
-        return *host_or_device_ptr;
-    }
-    T scalar;
-    auto event = queue.copy(host_or_device_ptr, &scalar, 1);
-    event.wait_and_throw();
-    return scalar;
-/// Merge multiple event dependencies into one
-inline sycl::event collapse_dependencies(sycl::queue& queue,
-                                         const std::vector<sycl::event>& dependencies) {
-    if (dependencies.empty()) {
-        return {};
-    }
-    else if (dependencies.size() == 1) {
-        return dependencies[0];
-    }
-    return queue.submit([&](sycl::handler& cgh) {
-        cgh.depends_on(dependencies);
-        cgh.host_task([=]() {});
-    });
-/// Convert \p value_type to template type argument and use it to call \p op_functor.
-#define DISPATCH_MKL_OPERATION(function_name, value_type, op_functor, ...)                         \
-    switch (value_type) {                                                                          \
-        case detail::data_type::real_fp32: return op_functor<float>(__VA_ARGS__);                  \
-        case detail::data_type::real_fp64: return op_functor<double>(__VA_ARGS__);                 \
-        case detail::data_type::complex_fp32: return op_functor<std::complex<float>>(__VA_ARGS__); \
-        case detail::data_type::complex_fp64:                                                      \
-            return op_functor<std::complex<double>>(__VA_ARGS__);                                  \
-        default:                                                                                   \
-            throw oneapi::mkl::exception(                                                          \
-                "sparse_blas", function_name,                                                      \
-                "Internal error: unsupported type " + data_type_to_str(value_type));               \
-    }
-#define CHECK_DESCR_MATCH(descr, argument, optimize_func_name)                                    \
-    do {                                                                                          \
-        if (descr->last_optimized_##argument != argument) {                                       \
-            throw mkl::invalid_argument(                                                          \
-                "sparse_blas", __func__,                                                          \
-                #argument " argument must match with the previous call to " #optimize_func_name); \
-        }                                                                                         \
-    } while (0)
-} // namespace oneapi::mkl::sparse::detail
diff --git a/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx b/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx
index 49987a202..9c0bc577b 100644
--- a/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx
+++ b/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx
@@ -17,6 +17,8 @@
+// In this file, functions and types qualified with the namespace oneapi::mkl::sparse:: refer to the backend's namespace, for better readability.
 namespace oneapi::mkl::sparse {
 struct spmm_descr {
@@ -24,68 +26,40 @@ struct spmm_descr {
     bool optimized_called = false;
     oneapi::mkl::transpose last_optimized_opA;
     oneapi::mkl::transpose last_optimized_opB;
-    oneapi::mkl::sparse::matrix_view last_optimized_A_view;
-    oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle;
-    oneapi::mkl::sparse::dense_matrix_handle_t last_optimized_B_handle;
-    oneapi::mkl::sparse::dense_matrix_handle_t last_optimized_C_handle;
-    oneapi::mkl::sparse::spmm_alg last_optimized_alg;
+    matrix_view last_optimized_A_view;
+    matrix_handle_t last_optimized_A_handle;
+    dense_matrix_handle_t last_optimized_B_handle;
+    dense_matrix_handle_t last_optimized_C_handle;
+    spmm_alg last_optimized_alg;
 } // namespace oneapi::mkl::sparse
 namespace oneapi::mkl::sparse::BACKEND {
-void init_spmm_descr(sycl::queue& /*queue*/, oneapi::mkl::sparse::spmm_descr_t* p_spmm_descr) {
+void init_spmm_descr(sycl::queue& /*queue*/, spmm_descr_t* p_spmm_descr) {
     *p_spmm_descr = new spmm_descr();
-sycl::event release_spmm_descr(sycl::queue& queue, oneapi::mkl::sparse::spmm_descr_t spmm_descr,
+sycl::event release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr,
                                const std::vector<sycl::event>& dependencies) {
     return detail::submit_release(queue, spmm_descr, dependencies);
 void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose opA,
-                      oneapi::mkl::sparse::matrix_view A_view,
-                      oneapi::mkl::sparse::matrix_handle_t A_handle,
-                      oneapi::mkl::sparse::dense_matrix_handle_t B_handle,
-                      oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
-                      bool is_alpha_host_accessible, bool is_beta_host_accessible) {
-    THROW_IF_NULLPTR(function_name, A_handle);
-    THROW_IF_NULLPTR(function_name, B_handle);
-    THROW_IF_NULLPTR(function_name, C_handle);
+                      matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle,
+                      dense_matrix_handle_t C_handle, bool is_alpha_host_accessible,
+                      bool is_beta_host_accessible) {
     auto internal_A_handle = detail::get_internal_handle(A_handle);
-    detail::check_all_containers_compatible(function_name, internal_A_handle, B_handle, C_handle);
-    if (internal_A_handle->all_use_buffer()) {
-        detail::check_ptr_is_host_accessible("spmm", "alpha", is_alpha_host_accessible);
-        detail::check_ptr_is_host_accessible("spmm", "beta", is_beta_host_accessible);
-    }
-    if (is_alpha_host_accessible != is_beta_host_accessible) {
-        throw mkl::invalid_argument(
-            "sparse_blas", function_name,
-            "Alpha and beta must both be placed on host memory or device memory.");
-    }
-    if (B_handle->dense_layout != C_handle->dense_layout) {
-        throw mkl::invalid_argument("sparse_blas", function_name,
-                                    "B and C matrices must used the same layout.");
-    }
-    if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::general) {
-        throw mkl::invalid_argument("sparse_blas", function_name,
-                                    "Matrix view's type must be `matrix_descr::general`.");
-    }
-    if (A_view.diag_view != oneapi::mkl::diag::nonunit) {
-        throw mkl::invalid_argument("sparse_blas", function_name,
-                                    "Matrix's diag_view must be `nonunit`.");
-    }
+    detail::check_valid_spmm_common(function_name, A_view, internal_A_handle, B_handle, C_handle,
+                                    is_alpha_host_accessible, is_beta_host_accessible);
 #if BACKEND == gpu
     detail::data_type data_type = internal_A_handle->get_value_type();
     if ((data_type == detail::data_type::complex_fp32 ||
          data_type == detail::data_type::complex_fp64) &&
         opA == oneapi::mkl::transpose::conjtrans &&
-        internal_A_handle->has_matrix_property(oneapi::mkl::sparse::matrix_property::symmetric)) {
+        internal_A_handle->has_matrix_property(matrix_property::symmetric)) {
         throw mkl::unimplemented(
             "sparse_blas", function_name,
             "The backend does not support spmm using conjtrans and the symmetric property.");
@@ -96,13 +70,10 @@ void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose o
 void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA,
-                      oneapi::mkl::transpose /*opB*/, const void* alpha,
-                      oneapi::mkl::sparse::matrix_view A_view,
-                      oneapi::mkl::sparse::matrix_handle_t A_handle,
-                      oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta,
-                      oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
-                      oneapi::mkl::sparse::spmm_alg /*alg*/,
-                      oneapi::mkl::sparse::spmm_descr_t spmm_descr, std::size_t& temp_buffer_size) {
+                      oneapi::mkl::transpose /*opB*/, const void* alpha, matrix_view A_view,
+                      matrix_handle_t A_handle, dense_matrix_handle_t B_handle, const void* beta,
+                      dense_matrix_handle_t C_handle, spmm_alg /*alg*/, spmm_descr_t spmm_descr,
+                      std::size_t& temp_buffer_size) {
     // TODO: Add support for external workspace once the close-source oneMKL backend supports it.
     bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
     bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
@@ -112,12 +83,11 @@ void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA,
     spmm_descr->buffer_size_called = true;
-inline void common_spmm_optimize(
-    sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, const void* alpha,
-    oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle,
-    oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta,
-    oneapi::mkl::sparse::dense_matrix_handle_t C_handle, oneapi::mkl::sparse::spmm_alg alg,
-    oneapi::mkl::sparse::spmm_descr_t spmm_descr) {
+inline void common_spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA,
+                                 oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view,
+                                 matrix_handle_t A_handle, dense_matrix_handle_t B_handle,
+                                 const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg,
+                                 spmm_descr_t spmm_descr) {
     bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
     bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
     check_valid_spmm("spmm_optimize", opA, A_view, A_handle, B_handle, C_handle,
@@ -137,11 +107,9 @@ inline void common_spmm_optimize(
 void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
-                   const void* alpha, oneapi::mkl::sparse::matrix_view A_view,
-                   oneapi::mkl::sparse::matrix_handle_t A_handle,
-                   oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta,
-                   oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
-                   oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr,
+                   const void* alpha, matrix_view A_view, matrix_handle_t A_handle,
+                   dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle,
+                   spmm_alg alg, spmm_descr_t spmm_descr,
                    sycl::buffer<std::uint8_t, 1> /*workspace*/) {
     auto internal_A_handle = detail::get_internal_handle(A_handle);
     if (!internal_A_handle->all_use_buffer()) {
@@ -149,7 +117,7 @@ void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::
     common_spmm_optimize(queue, opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg,
-    if (alg == oneapi::mkl::sparse::spmm_alg::no_optimize_alg) {
+    if (alg == spmm_alg::no_optimize_alg) {
     internal_A_handle->can_be_reset = false;
@@ -157,13 +125,10 @@ void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::
 sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA,
-                          oneapi::mkl::transpose opB, const void* alpha,
-                          oneapi::mkl::sparse::matrix_view A_view,
-                          oneapi::mkl::sparse::matrix_handle_t A_handle,
-                          oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta,
-                          oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
-                          oneapi::mkl::sparse::spmm_alg alg,
-                          oneapi::mkl::sparse::spmm_descr_t spmm_descr, void* /*workspace*/,
+                          oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view,
+                          matrix_handle_t A_handle, dense_matrix_handle_t B_handle,
+                          const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg,
+                          spmm_descr_t spmm_descr, void* /*workspace*/,
                           const std::vector<sycl::event>& dependencies) {
     auto internal_A_handle = detail::get_internal_handle(A_handle);
     if (internal_A_handle->all_use_buffer()) {
@@ -171,7 +136,7 @@ sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA,
     common_spmm_optimize(queue, opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg,
-    if (alg == oneapi::mkl::sparse::spmm_alg::no_optimize_alg) {
+    if (alg == spmm_alg::no_optimize_alg) {
         return detail::collapse_dependencies(queue, dependencies);
     internal_A_handle->can_be_reset = false;
@@ -180,13 +145,12 @@ sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA,
 template <typename T>
-sycl::event internal_spmm(
-    sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, const void* alpha,
-    oneapi::mkl::sparse::matrix_view /*A_view*/, oneapi::mkl::sparse::matrix_handle_t A_handle,
-    oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta,
-    oneapi::mkl::sparse::dense_matrix_handle_t C_handle, oneapi::mkl::sparse::spmm_alg /*alg*/,
-    oneapi::mkl::sparse::spmm_descr_t /*spmm_descr*/, const std::vector<sycl::event>& dependencies,
-    bool is_alpha_host_accessible, bool is_beta_host_accessible) {
+sycl::event internal_spmm(sycl::queue& queue, oneapi::mkl::transpose opA,
+                          oneapi::mkl::transpose opB, const void* alpha, matrix_view /*A_view*/,
+                          matrix_handle_t A_handle, dense_matrix_handle_t B_handle,
+                          const void* beta, dense_matrix_handle_t C_handle, spmm_alg /*alg*/,
+                          spmm_descr_t /*spmm_descr*/, const std::vector<sycl::event>& dependencies,
+                          bool is_alpha_host_accessible, bool is_beta_host_accessible) {
     T host_alpha =
         detail::get_scalar_on_host(queue, static_cast<const T*>(alpha), is_alpha_host_accessible);
     T host_beta =
@@ -213,11 +177,9 @@ sycl::event internal_spmm(
 sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
-                 const void* alpha, oneapi::mkl::sparse::matrix_view A_view,
-                 oneapi::mkl::sparse::matrix_handle_t A_handle,
-                 oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta,
-                 oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
-                 oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr,
+                 const void* alpha, matrix_view A_view, matrix_handle_t A_handle,
+                 dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle,
+                 spmm_alg alg, spmm_descr_t spmm_descr,
                  const std::vector<sycl::event>& dependencies) {
     bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
     bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
diff --git a/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx b/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx
index d5a24e9f4..9fc43d8e9 100644
--- a/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx
+++ b/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx
@@ -17,67 +17,44 @@
+// In this file, functions and types qualified with the namespace oneapi::mkl::sparse:: refer to the backend's namespace, for better readability.
 namespace oneapi::mkl::sparse {
 struct spmv_descr {
     bool buffer_size_called = false;
     bool optimized_called = false;
     oneapi::mkl::transpose last_optimized_opA;
-    oneapi::mkl::sparse::matrix_view last_optimized_A_view;
-    oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle;
-    oneapi::mkl::sparse::dense_vector_handle_t last_optimized_x_handle;
-    oneapi::mkl::sparse::dense_vector_handle_t last_optimized_y_handle;
-    oneapi::mkl::sparse::spmv_alg last_optimized_alg;
+    matrix_view last_optimized_A_view;
+    matrix_handle_t last_optimized_A_handle;
+    dense_vector_handle_t last_optimized_x_handle;
+    dense_vector_handle_t last_optimized_y_handle;
+    spmv_alg last_optimized_alg;
 } // namespace oneapi::mkl::sparse
 namespace oneapi::mkl::sparse::BACKEND {
-void init_spmv_descr(sycl::queue& /*queue*/, oneapi::mkl::sparse::spmv_descr_t* p_spmv_descr) {
+void init_spmv_descr(sycl::queue& /*queue*/, spmv_descr_t* p_spmv_descr) {
     *p_spmv_descr = new spmv_descr();
-sycl::event release_spmv_descr(sycl::queue& queue, oneapi::mkl::sparse::spmv_descr_t spmv_descr,
+sycl::event release_spmv_descr(sycl::queue& queue, spmv_descr_t spmv_descr,
                                const std::vector<sycl::event>& dependencies) {
     return detail::submit_release(queue, spmv_descr, dependencies);
 void check_valid_spmv(const std::string& function_name, oneapi::mkl::transpose opA,
-                      oneapi::mkl::sparse::matrix_view A_view,
-                      oneapi::mkl::sparse::matrix_handle_t A_handle,
-                      oneapi::mkl::sparse::dense_vector_handle_t x_handle,
-                      oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                      bool is_alpha_host_accessible, bool is_beta_host_accessible) {
-    THROW_IF_NULLPTR(function_name, A_handle);
-    THROW_IF_NULLPTR(function_name, x_handle);
-    THROW_IF_NULLPTR(function_name, y_handle);
+                      matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                      dense_vector_handle_t y_handle, bool is_alpha_host_accessible,
+                      bool is_beta_host_accessible) {
     auto internal_A_handle = detail::get_internal_handle(A_handle);
-    detail::check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle);
-    if (internal_A_handle->all_use_buffer()) {
-        detail::check_ptr_is_host_accessible("spmv", "alpha", is_alpha_host_accessible);
-        detail::check_ptr_is_host_accessible("spmv", "beta", is_beta_host_accessible);
-    }
-    if (is_alpha_host_accessible != is_beta_host_accessible) {
-        throw mkl::invalid_argument(
-            "sparse_blas", function_name,
-            "Alpha and beta must both be placed on host memory or device memory.");
-    }
-    if (A_view.type_view == oneapi::mkl::sparse::matrix_descr::diagonal) {
-        throw mkl::invalid_argument("sparse_blas", function_name,
-                                    "Matrix view's type cannot be diagonal.");
-    }
-    if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::triangular &&
-        A_view.diag_view == oneapi::mkl::diag::unit) {
-        throw mkl::invalid_argument(
-            "sparse_blas", function_name,
-            "`unit` diag_view can only be used with a triangular type_view.");
-    }
+    detail::check_valid_spmv_common(function_name, opA, A_view, internal_A_handle, x_handle,
+                                    y_handle, is_alpha_host_accessible, is_beta_host_accessible);
-    if ((A_view.type_view == oneapi::mkl::sparse::matrix_descr::symmetric ||
-         A_view.type_view == oneapi::mkl::sparse::matrix_descr::hermitian) &&
+    if ((A_view.type_view == matrix_descr::symmetric ||
+         A_view.type_view == matrix_descr::hermitian) &&
         opA == oneapi::mkl::transpose::conjtrans) {
         throw mkl::unimplemented(
             "sparse_blas", function_name,
@@ -86,12 +63,9 @@ void check_valid_spmv(const std::string& function_name, oneapi::mkl::transpose o
 void spmv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
-                      oneapi::mkl::sparse::matrix_view A_view,
-                      oneapi::mkl::sparse::matrix_handle_t A_handle,
-                      oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void* beta,
-                      oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                      oneapi::mkl::sparse::spmv_alg /*alg*/,
-                      oneapi::mkl::sparse::spmv_descr_t spmv_descr, std::size_t& temp_buffer_size) {
+                      matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                      const void* beta, dense_vector_handle_t y_handle, spmv_alg /*alg*/,
+                      spmv_descr_t spmv_descr, std::size_t& temp_buffer_size) {
     // TODO: Add support for external workspace once the close-source oneMKL backend supports it.
     bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
     bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
@@ -102,13 +76,10 @@ void spmv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void
 inline void common_spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
-                                 oneapi::mkl::sparse::matrix_view A_view,
-                                 oneapi::mkl::sparse::matrix_handle_t A_handle,
-                                 oneapi::mkl::sparse::dense_vector_handle_t x_handle,
-                                 const void* beta,
-                                 oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                                 oneapi::mkl::sparse::spmv_alg alg,
-                                 oneapi::mkl::sparse::spmv_descr_t spmv_descr) {
+                                 matrix_view A_view, matrix_handle_t A_handle,
+                                 dense_vector_handle_t x_handle, const void* beta,
+                                 dense_vector_handle_t y_handle, spmv_alg alg,
+                                 spmv_descr_t spmv_descr) {
     bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
     bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
     check_valid_spmv("spmv_optimize", opA, A_view, A_handle, x_handle, y_handle,
@@ -127,19 +98,16 @@ inline void common_spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA,
 void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
-                   oneapi::mkl::sparse::matrix_view A_view,
-                   oneapi::mkl::sparse::matrix_handle_t A_handle,
-                   oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void* beta,
-                   oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                   oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr,
-                   sycl::buffer<std::uint8_t, 1> /*workspace*/) {
+                   matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                   const void* beta, dense_vector_handle_t y_handle, spmv_alg alg,
+                   spmv_descr_t spmv_descr, sycl::buffer<std::uint8_t, 1> /*workspace*/) {
     auto internal_A_handle = detail::get_internal_handle(A_handle);
     if (!internal_A_handle->all_use_buffer()) {
     common_spmv_optimize(queue, opA, alpha, A_view, A_handle, x_handle, beta, y_handle, alg,
-    if (alg == oneapi::mkl::sparse::spmv_alg::no_optimize_alg) {
+    if (alg == spmv_alg::no_optimize_alg) {
     internal_A_handle->can_be_reset = false;
@@ -158,20 +126,17 @@ void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* a
 sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
-                          oneapi::mkl::sparse::matrix_view A_view,
-                          oneapi::mkl::sparse::matrix_handle_t A_handle,
-                          oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void* beta,
-                          oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                          oneapi::mkl::sparse::spmv_alg alg,
-                          oneapi::mkl::sparse::spmv_descr_t spmv_descr, void* /*workspace*/,
-                          const std::vector<sycl::event>& dependencies) {
+                          matrix_view A_view, matrix_handle_t A_handle,
+                          dense_vector_handle_t x_handle, const void* beta,
+                          dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr,
+                          void* /*workspace*/, const std::vector<sycl::event>& dependencies) {
     auto internal_A_handle = detail::get_internal_handle(A_handle);
     if (internal_A_handle->all_use_buffer()) {
     common_spmv_optimize(queue, opA, alpha, A_view, A_handle, x_handle, beta, y_handle, alg,
-    if (alg == oneapi::mkl::sparse::spmv_alg::no_optimize_alg) {
+    if (alg == spmv_alg::no_optimize_alg) {
         return detail::collapse_dependencies(queue, dependencies);
     internal_A_handle->can_be_reset = false;
@@ -191,13 +156,10 @@ sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const
 template <typename T>
 sycl::event internal_spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
-                          oneapi::mkl::sparse::matrix_view A_view,
-                          oneapi::mkl::sparse::matrix_handle_t A_handle,
-                          oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void* beta,
-                          oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                          oneapi::mkl::sparse::spmv_alg /*alg*/,
-                          oneapi::mkl::sparse::spmv_descr_t /*spmv_descr*/,
-                          const std::vector<sycl::event>& dependencies,
+                          matrix_view A_view, matrix_handle_t A_handle,
+                          dense_vector_handle_t x_handle, const void* beta,
+                          dense_vector_handle_t y_handle, spmv_alg /*alg*/,
+                          spmv_descr_t /*spmv_descr*/, const std::vector<sycl::event>& dependencies,
                           bool is_alpha_host_accessible, bool is_beta_host_accessible) {
     T host_alpha =
         detail::get_scalar_on_host(queue, static_cast<const T*>(alpha), is_alpha_host_accessible);
@@ -246,12 +208,9 @@ sycl::event internal_spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const
 sycl::event spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
-                 oneapi::mkl::sparse::matrix_view A_view,
-                 oneapi::mkl::sparse::matrix_handle_t A_handle,
-                 oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void* beta,
-                 oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                 oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr,
-                 const std::vector<sycl::event>& dependencies) {
+                 matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                 const void* beta, dense_vector_handle_t y_handle, spmv_alg alg,
+                 spmv_descr_t spmv_descr, const std::vector<sycl::event>& dependencies) {
     bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
     bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
     check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible,
diff --git a/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx b/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx
index f73000340..dd2a4f627 100644
--- a/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx
+++ b/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx
@@ -17,45 +17,43 @@
+// In this file, functions and types using the namespace oneapi::mkl::sparse:: refer to the backend's namespace for better readability.
 namespace oneapi::mkl::sparse {
 struct spsv_descr {
     bool buffer_size_called = false;
     bool optimized_called = false;
     oneapi::mkl::transpose last_optimized_opA;
-    oneapi::mkl::sparse::matrix_view last_optimized_A_view;
-    oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle;
-    oneapi::mkl::sparse::dense_vector_handle_t last_optimized_x_handle;
-    oneapi::mkl::sparse::dense_vector_handle_t last_optimized_y_handle;
-    oneapi::mkl::sparse::spsv_alg last_optimized_alg;
+    matrix_view last_optimized_A_view;
+    matrix_handle_t last_optimized_A_handle;
+    dense_vector_handle_t last_optimized_x_handle;
+    dense_vector_handle_t last_optimized_y_handle;
+    spsv_alg last_optimized_alg;
 } // namespace oneapi::mkl::sparse
 namespace oneapi::mkl::sparse::BACKEND {
-void init_spsv_descr(sycl::queue& /*queue*/, oneapi::mkl::sparse::spsv_descr_t* p_spsv_descr) {
+void init_spsv_descr(sycl::queue& /*queue*/, spsv_descr_t* p_spsv_descr) {
     *p_spsv_descr = new spsv_descr();
-sycl::event release_spsv_descr(sycl::queue& queue, oneapi::mkl::sparse::spsv_descr_t spsv_descr,
+sycl::event release_spsv_descr(sycl::queue& queue, spsv_descr_t spsv_descr,
                                const std::vector<sycl::event>& dependencies) {
     return detail::submit_release(queue, spsv_descr, dependencies);
 void check_valid_spsv(const std::string& function_name, oneapi::mkl::transpose opA,
-                      oneapi::mkl::sparse::matrix_view A_view,
-                      oneapi::mkl::sparse::matrix_handle_t A_handle,
-                      oneapi::mkl::sparse::dense_vector_handle_t x_handle,
-                      oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                      bool is_alpha_host_accessible, oneapi::mkl::sparse::spsv_alg alg) {
-    THROW_IF_NULLPTR(function_name, A_handle);
-    THROW_IF_NULLPTR(function_name, x_handle);
-    THROW_IF_NULLPTR(function_name, y_handle);
+                      matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                      dense_vector_handle_t y_handle, bool is_alpha_host_accessible, spsv_alg alg) {
     auto internal_A_handle = detail::get_internal_handle(A_handle);
-    if (alg == oneapi::mkl::sparse::spsv_alg::no_optimize_alg &&
-        !internal_A_handle->has_matrix_property(oneapi::mkl::sparse::matrix_property::sorted)) {
+    detail::check_valid_spsv_common(function_name, A_view, internal_A_handle, x_handle, y_handle,
+                                    is_alpha_host_accessible);
+    if (alg == spsv_alg::no_optimize_alg &&
+        !internal_A_handle->has_matrix_property(matrix_property::sorted)) {
         throw mkl::unimplemented(
             "sparse_blas", function_name,
             "The backend does not support `no_optimize_alg` unless A_handle has the property `matrix_property::sorted`.");
@@ -72,25 +70,12 @@ void check_valid_spsv(const std::string& function_name, oneapi::mkl::transpose o
 #endif // BACKEND
-    detail::check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle);
-    if (A_view.type_view != matrix_descr::triangular) {
-        throw mkl::invalid_argument("sparse_blas", function_name,
-                                    "Matrix view's type must be `matrix_descr::triangular`.");
-    }
-    if (internal_A_handle->all_use_buffer()) {
-        detail::check_ptr_is_host_accessible("spsv", "alpha", is_alpha_host_accessible);
-    }
 void spsv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
-                      oneapi::mkl::sparse::matrix_view A_view,
-                      oneapi::mkl::sparse::matrix_handle_t A_handle,
-                      oneapi::mkl::sparse::dense_vector_handle_t x_handle,
-                      oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                      oneapi::mkl::sparse::spsv_alg alg,
-                      oneapi::mkl::sparse::spsv_descr_t spsv_descr, std::size_t& temp_buffer_size) {
+                      matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                      dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr,
+                      std::size_t& temp_buffer_size) {
     // TODO: Add support for external workspace once the close-source oneMKL backend supports it.
     bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
     check_valid_spsv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible,
@@ -100,12 +85,9 @@ void spsv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void
 inline void common_spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
-                                 oneapi::mkl::sparse::matrix_view A_view,
-                                 oneapi::mkl::sparse::matrix_handle_t A_handle,
-                                 oneapi::mkl::sparse::dense_vector_handle_t x_handle,
-                                 oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                                 oneapi::mkl::sparse::spsv_alg alg,
-                                 oneapi::mkl::sparse::spsv_descr_t spsv_descr) {
+                                 matrix_view A_view, matrix_handle_t A_handle,
+                                 dense_vector_handle_t x_handle, dense_vector_handle_t y_handle,
+                                 spsv_alg alg, spsv_descr_t spsv_descr) {
     bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
     check_valid_spsv("spsv_optimize", opA, A_view, A_handle, x_handle, y_handle,
                      is_alpha_host_accessible, alg);
@@ -123,18 +105,15 @@ inline void common_spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA,
 void spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
-                   oneapi::mkl::sparse::matrix_view A_view,
-                   oneapi::mkl::sparse::matrix_handle_t A_handle,
-                   oneapi::mkl::sparse::dense_vector_handle_t x_handle,
-                   oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                   oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr,
+                   matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                   dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr,
                    sycl::buffer<std::uint8_t, 1> /*workspace*/) {
     auto internal_A_handle = detail::get_internal_handle(A_handle);
     if (!internal_A_handle->all_use_buffer()) {
     common_spsv_optimize(queue, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr);
-    if (alg == oneapi::mkl::sparse::spsv_alg::no_optimize_alg) {
+    if (alg == spsv_alg::no_optimize_alg) {
     internal_A_handle->can_be_reset = false;
@@ -143,19 +122,16 @@ void spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* a
 sycl::event spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
-                          oneapi::mkl::sparse::matrix_view A_view,
-                          oneapi::mkl::sparse::matrix_handle_t A_handle,
-                          oneapi::mkl::sparse::dense_vector_handle_t x_handle,
-                          oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                          oneapi::mkl::sparse::spsv_alg alg,
-                          oneapi::mkl::sparse::spsv_descr_t spsv_descr, void* /*workspace*/,
+                          matrix_view A_view, matrix_handle_t A_handle,
+                          dense_vector_handle_t x_handle, dense_vector_handle_t y_handle,
+                          spsv_alg alg, spsv_descr_t spsv_descr, void* /*workspace*/,
                           const std::vector<sycl::event>& dependencies) {
     auto internal_A_handle = detail::get_internal_handle(A_handle);
     if (internal_A_handle->all_use_buffer()) {
     common_spsv_optimize(queue, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr);
-    if (alg == oneapi::mkl::sparse::spsv_alg::no_optimize_alg) {
+    if (alg == spsv_alg::no_optimize_alg) {
         return detail::collapse_dependencies(queue, dependencies);
     internal_A_handle->can_be_reset = false;
@@ -165,12 +141,9 @@ sycl::event spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const
 template <typename T>
 sycl::event internal_spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
-                          oneapi::mkl::sparse::matrix_view A_view,
-                          oneapi::mkl::sparse::matrix_handle_t A_handle,
-                          oneapi::mkl::sparse::dense_vector_handle_t x_handle,
-                          oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                          oneapi::mkl::sparse::spsv_alg /*alg*/,
-                          oneapi::mkl::sparse::spsv_descr_t /*spsv_descr*/,
+                          matrix_view A_view, matrix_handle_t A_handle,
+                          dense_vector_handle_t x_handle, dense_vector_handle_t y_handle,
+                          spsv_alg /*alg*/, spsv_descr_t /*spsv_descr*/,
                           const std::vector<sycl::event>& dependencies,
                           bool is_alpha_host_accessible) {
     T host_alpha =
@@ -193,11 +166,8 @@ sycl::event internal_spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const
 sycl::event spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha,
-                 oneapi::mkl::sparse::matrix_view A_view,
-                 oneapi::mkl::sparse::matrix_handle_t A_handle,
-                 oneapi::mkl::sparse::dense_vector_handle_t x_handle,
-                 oneapi::mkl::sparse::dense_vector_handle_t y_handle,
-                 oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr,
+                 matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle,
+                 dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr,
                  const std::vector<sycl::event>& dependencies) {
     bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
     check_valid_spsv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible,
diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp
index a6ea51629..0aaf91b25 100644
--- a/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp
+++ b/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp
@@ -19,7 +19,7 @@
 #include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp"
-#include "sparse_blas/backends/mkl_common/mkl_helper.hpp"
+#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp"
 #include "sparse_blas/backends/mkl_common/mkl_handles.hpp"
 namespace oneapi::mkl::sparse::mklcpu {
diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp
index 0929a7ef4..ebc8ceecf 100644
--- a/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp
+++ b/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp
@@ -17,10 +17,12 @@
+#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp"
 #include "sparse_blas/backends/mkl_common/mkl_handles.hpp"
-#include "sparse_blas/backends/mkl_common/mkl_helper.hpp"
+#include "sparse_blas/common_op_verification.hpp"
 #include "sparse_blas/macros.hpp"
 #include "sparse_blas/matrix_view_comparison.hpp"
+#include "sparse_blas/sycl_helper.hpp"
 #include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp"
diff --git a/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp b/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp
index 7cb9853a7..648fed66e 100644
--- a/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp
+++ b/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp
@@ -19,8 +19,8 @@
 #include "oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp"
+#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp"
 #include "sparse_blas/backends/mkl_common/mkl_handles.hpp"
-#include "sparse_blas/backends/mkl_common/mkl_helper.hpp"
 namespace oneapi::mkl::sparse::mklgpu {
diff --git a/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp b/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp
index be5e0c0aa..1102306dc 100644
--- a/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp
+++ b/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp
@@ -17,10 +17,12 @@
+#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp"
 #include "sparse_blas/backends/mkl_common/mkl_handles.hpp"
-#include "sparse_blas/backends/mkl_common/mkl_helper.hpp"
+#include "sparse_blas/common_op_verification.hpp"
 #include "sparse_blas/macros.hpp"
 #include "sparse_blas/matrix_view_comparison.hpp"
+#include "sparse_blas/sycl_helper.hpp"
 #include "oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp"
diff --git a/src/sparse_blas/common_op_verification.hpp b/src/sparse_blas/common_op_verification.hpp
new file mode 100644
index 000000000..318766fb4
--- /dev/null
+++ b/src/sparse_blas/common_op_verification.hpp
@@ -0,0 +1,137 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+#include <string>
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#include <CL/sycl.hpp>
+#include "oneapi/mkl/sparse_blas/types.hpp"
+#include "macros.hpp"
+namespace oneapi::mkl::sparse::detail {
+/// Throw an exception if the scalar is not accessible on the host
+inline void check_ptr_is_host_accessible(const std::string& function_name,
+                                         const std::string& scalar_name,
+                                         bool is_ptr_accessible_on_host) {
+    if (!is_ptr_accessible_on_host) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Scalar " + scalar_name + " must be accessible on the host for buffer functions.");
+    }
+template <typename InternalSparseMatHandleT>
+void check_valid_spmm_common(const std::string& function_name, matrix_view A_view,
+                             InternalSparseMatHandleT internal_A_handle,
+                             dense_matrix_handle_t B_handle, dense_matrix_handle_t C_handle,
+                             bool is_alpha_host_accessible, bool is_beta_host_accessible) {
+    THROW_IF_NULLPTR(function_name, internal_A_handle);
+    THROW_IF_NULLPTR(function_name, B_handle);
+    THROW_IF_NULLPTR(function_name, C_handle);
+    check_all_containers_compatible(function_name, internal_A_handle, B_handle, C_handle);
+    if (internal_A_handle->all_use_buffer()) {
+        check_ptr_is_host_accessible("spmm", "alpha", is_alpha_host_accessible);
+        check_ptr_is_host_accessible("spmm", "beta", is_beta_host_accessible);
+    }
+    if (is_alpha_host_accessible != is_beta_host_accessible) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Alpha and beta must both be placed on host memory or device memory.");
+    }
+    if (B_handle->dense_layout != C_handle->dense_layout) {
+        throw mkl::invalid_argument("sparse_blas", function_name,
+                                    "B and C matrices must use the same layout.");
+    }
+    if (A_view.type_view != matrix_descr::general) {
+        throw mkl::invalid_argument("sparse_blas", function_name,
+                                    "Matrix view's `type_view` must be `matrix_descr::general`.");
+    }
+    if (A_view.diag_view != oneapi::mkl::diag::nonunit) {
+        throw mkl::invalid_argument("sparse_blas", function_name,
+                                    "Matrix's diag_view must be `nonunit`.");
+    }
+template <typename InternalSparseMatHandleT>
+void check_valid_spmv_common(const std::string& function_name, oneapi::mkl::transpose /*opA*/,
+                             matrix_view A_view, InternalSparseMatHandleT internal_A_handle,
+                             dense_vector_handle_t x_handle, dense_vector_handle_t y_handle,
+                             bool is_alpha_host_accessible, bool is_beta_host_accessible) {
+    THROW_IF_NULLPTR(function_name, internal_A_handle);
+    THROW_IF_NULLPTR(function_name, x_handle);
+    THROW_IF_NULLPTR(function_name, y_handle);
+    check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle);
+    if (internal_A_handle->all_use_buffer()) {
+        check_ptr_is_host_accessible("spmv", "alpha", is_alpha_host_accessible);
+        check_ptr_is_host_accessible("spmv", "beta", is_beta_host_accessible);
+    }
+    if (is_alpha_host_accessible != is_beta_host_accessible) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Alpha and beta must both be placed on host memory or device memory.");
+    }
+    if (A_view.type_view == matrix_descr::diagonal) {
+        throw mkl::invalid_argument("sparse_blas", function_name,
+                                    "Matrix view's `type_view` cannot be diagonal.");
+    }
+    if (A_view.type_view != matrix_descr::triangular &&
+        A_view.diag_view == oneapi::mkl::diag::unit) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "`diag_view::unit` can only be used with `type_view::triangular`.");
+    }
+template <typename InternalSparseMatHandleT>
+void check_valid_spsv_common(const std::string& function_name, matrix_view A_view,
+                             InternalSparseMatHandleT internal_A_handle,
+                             dense_vector_handle_t x_handle, dense_vector_handle_t y_handle,
+                             bool is_alpha_host_accessible) {
+    THROW_IF_NULLPTR(function_name, internal_A_handle);
+    THROW_IF_NULLPTR(function_name, x_handle);
+    THROW_IF_NULLPTR(function_name, y_handle);
+    check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle);
+    if (A_view.type_view != matrix_descr::triangular) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Matrix view's `type_view` must be `matrix_descr::triangular`.");
+    }
+    if (internal_A_handle->all_use_buffer()) {
+        check_ptr_is_host_accessible("spsv", "alpha", is_alpha_host_accessible);
+    }
+} // namespace oneapi::mkl::sparse::detail
\ No newline at end of file
diff --git a/src/sparse_blas/generic_container.hpp b/src/sparse_blas/generic_container.hpp
index 53bd50837..c2e8476a7 100644
--- a/src/sparse_blas/generic_container.hpp
+++ b/src/sparse_blas/generic_container.hpp
@@ -39,10 +39,12 @@ struct generic_container {
     // USM pointer, nullptr if the provided data is a buffer.
     void* usm_ptr;
-    // Buffer pointer, nullptr if the provided data is a USM pointer.
-    // The buffer is needed to properly handle the dependencies when the handle is used.
-    // Use a void* type for the buffer to avoid using template arguments in every function using data handles.
-    // Using reinterpret does not solve the issue as the returned buffer needs the type of the original buffer for the aligned_allocator.
+    // Buffer pointer, nullptr if the provided data is a USM pointer. The buffer
+    // is needed to properly handle the dependencies when the handle is used.
+    // Use a void* type for the buffer to avoid using template arguments in
+    // every function using data handles. Using `sycl::buffer::reinterpret` does
+    // not solve the issue as the returned buffer needs the type of the original
+    // buffer for the aligned_allocator.
     std::shared_ptr<void> buffer_ptr;
     // Underlying USM or buffer data type
@@ -61,6 +63,10 @@ struct generic_container {
               buffer_ptr(std::make_shared<sycl::buffer<T, 1>>(buffer)),
               data_type(get_data_type<T>()) {}
+    bool use_buffer() const {
+        return static_cast<bool>(buffer_ptr);
+    }
     template <typename T>
     void set_usm_ptr(T* ptr) {
         usm_ptr = ptr;
@@ -108,7 +114,7 @@ struct generic_dense_handle {
               value_container(value_buffer) {}
     bool all_use_buffer() const {
-        return static_cast<bool>(value_container.buffer_ptr);
+        return value_container.use_buffer();
     data_type get_value_type() const {
@@ -201,6 +207,8 @@ struct generic_dense_matrix_handle : public detail::generic_dense_handle<Backend
+enum class sparse_format { CSR, COO };
 /// Generic sparse_matrix_handle used by all backends
 template <typename BackendHandleT>
 struct generic_sparse_handle {
@@ -210,34 +218,51 @@ struct generic_sparse_handle {
     generic_container col_container;
     generic_container value_container;
+    sparse_format format;
+    std::int64_t num_rows;
+    std::int64_t num_cols;
+    std::int64_t nnz;
+    oneapi::mkl::index_base index;
     std::int32_t properties_mask;
     bool can_be_reset;
     template <typename fpType, typename intType>
     generic_sparse_handle(BackendHandleT backend_handle, intType* row_ptr, intType* col_ptr,
-                          fpType* value_ptr)
+                          fpType* value_ptr, sparse_format format, std::int64_t num_rows,
+                          std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index)
             : backend_handle(backend_handle),
+              format(format),
+              num_rows(num_rows),
+              num_cols(num_cols),
+              nnz(nnz),
+              index(index),
               can_be_reset(true) {}
     template <typename fpType, typename intType>
     generic_sparse_handle(BackendHandleT backend_handle, const sycl::buffer<intType, 1> row_buffer,
                           const sycl::buffer<intType, 1> col_buffer,
-                          const sycl::buffer<fpType, 1> value_buffer)
+                          const sycl::buffer<fpType, 1> value_buffer, sparse_format format,
+                          std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
+                          oneapi::mkl::index_base index)
             : backend_handle(backend_handle),
+              format(format),
+              num_rows(num_rows),
+              num_cols(num_cols),
+              nnz(nnz),
+              index(index),
               can_be_reset(true) {}
     bool all_use_buffer() const {
-        return static_cast<bool>(value_container.buffer_ptr) &&
-               static_cast<bool>(row_container.buffer_ptr) &&
-               static_cast<bool>(col_container.buffer_ptr);
+        return value_container.use_buffer() && row_container.use_buffer() &&
+               col_container.use_buffer();
     data_type get_value_type() const {
@@ -248,19 +273,20 @@ struct generic_sparse_handle {
         return row_container.data_type;
-    void set_matrix_property(oneapi::mkl::sparse::matrix_property property) {
+    void set_matrix_property(matrix_property property) {
         properties_mask |= matrix_property_to_mask(property);
-    bool has_matrix_property(oneapi::mkl::sparse::matrix_property property) {
+    bool has_matrix_property(matrix_property property) {
         return properties_mask & matrix_property_to_mask(property);
-    std::int32_t matrix_property_to_mask(oneapi::mkl::sparse::matrix_property property) {
+    std::int32_t matrix_property_to_mask(matrix_property property) {
         switch (property) {
-            case oneapi::mkl::sparse::matrix_property::symmetric: return 1 << 0;
-            case oneapi::mkl::sparse::matrix_property::sorted: return 1 << 1;
+            case matrix_property::symmetric: return 1 << 0;
+            case matrix_property::sorted: return 1 << 1;
+            case matrix_property::sorted_by_rows: return 1 << 2;
                 throw oneapi::mkl::invalid_argument(
                     "sparse_blas", "set_matrix_property",
@@ -321,12 +347,38 @@ void check_all_containers_compatible(const std::string& function_name,
-template <typename T, typename DependenciesT>
-sycl::event submit_release(sycl::queue& queue, T* ptr, const DependenciesT& dependencies) {
-    return queue.submit([&](sycl::handler& cgh) {
-        cgh.depends_on(dependencies);
-        cgh.host_task([=]() { delete ptr; });
-    });
+template <typename fpType, typename InternalHandleT>
+void check_can_reset_value_handle(const std::string& function_name,
+                                  InternalHandleT* internal_handle, bool expect_buffer) {
+    if (internal_handle->get_value_type() != detail::get_data_type<fpType>()) {
+        throw oneapi::mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Incompatible data types expected " +
+                data_type_to_str(internal_handle->get_value_type()) + " but got " +
+                data_type_to_str(detail::get_data_type<fpType>()));
+    }
+    if (internal_handle->all_use_buffer() != expect_buffer) {
+        throw oneapi::mkl::invalid_argument(
+            "sparse_blas", function_name, "Cannot change the container type between buffer or USM");
+    }
+template <typename fpType, typename intType, typename InternalHandleT>
+void check_can_reset_sparse_handle(const std::string& function_name,
+                                   InternalHandleT* internal_smhandle, bool expect_buffer) {
+    check_can_reset_value_handle<fpType>(function_name, internal_smhandle, expect_buffer);
+    if (internal_smhandle->get_int_type() != detail::get_data_type<intType>()) {
+        throw oneapi::mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Incompatible data types expected " +
+                data_type_to_str(internal_smhandle->get_int_type()) + " but got " +
+                data_type_to_str(detail::get_data_type<intType>()));
+    }
+    if (!internal_smhandle->can_be_reset) {
+        throw mkl::unimplemented(
+            "sparse_blas", function_name,
+            "The backend does not support reseting the matrix handle's data after it was used in a computation.");
+    }
 } // namespace oneapi::mkl::sparse::detail
diff --git a/src/sparse_blas/macros.hpp b/src/sparse_blas/macros.hpp
index 7eba01390..72aa39a75 100644
--- a/src/sparse_blas/macros.hpp
+++ b/src/sparse_blas/macros.hpp
@@ -36,10 +36,91 @@
     FOR_EACH_FP_AND_INT_TYPE_HELPER(DEFINE_MACRO, std::int32_t, _i32); \
+#define INSTANTIATE_DENSE_VECTOR_FUNCS(FP_TYPE, FP_SUFFIX)                            \
+    template void init_dense_vector<FP_TYPE>(                                         \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \
+        std::int64_t size, sycl::buffer<FP_TYPE, 1> val);                             \
+    template void init_dense_vector<FP_TYPE>(                                         \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \
+        std::int64_t size, FP_TYPE* val);                                             \
+    template void set_dense_vector_data<FP_TYPE>(                                     \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle,     \
+        std::int64_t size, sycl::buffer<FP_TYPE, 1> val);                             \
+    template void set_dense_vector_data<FP_TYPE>(                                     \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle,     \
+        std::int64_t size, FP_TYPE* val)
+#define INSTANTIATE_DENSE_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX)                            \
+    template void init_dense_matrix<FP_TYPE>(                                         \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
+        oneapi::mkl::layout dense_layout, sycl::buffer<FP_TYPE, 1> val);              \
+    template void init_dense_matrix<FP_TYPE>(                                         \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
+        oneapi::mkl::layout dense_layout, FP_TYPE* val);                              \
+    template void set_dense_matrix_data<FP_TYPE>(                                     \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,     \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
+        oneapi::mkl::layout dense_layout, sycl::buffer<FP_TYPE, 1> val);              \
+    template void set_dense_matrix_data<FP_TYPE>(                                     \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,     \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
+        oneapi::mkl::layout dense_layout, FP_TYPE* val)
+    template void init_coo_matrix<FP_TYPE, INT_TYPE>(                                              \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
+        oneapi::mkl::index_base index, sycl::buffer<INT_TYPE, 1> row_ind,                          \
+        sycl::buffer<INT_TYPE, 1> col_ind, sycl::buffer<FP_TYPE, 1> val);                          \
+    template void init_coo_matrix<FP_TYPE, INT_TYPE>(                                              \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
+        oneapi::mkl::index_base index, INT_TYPE* row_ind, INT_TYPE* col_ind, FP_TYPE* val);        \
+    template void set_coo_matrix_data<FP_TYPE, INT_TYPE>(                                          \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
+        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,                    \
+        sycl::buffer<INT_TYPE, 1> row_ind, sycl::buffer<INT_TYPE, 1> col_ind,                      \
+        sycl::buffer<FP_TYPE, 1> val);                                                             \
+    template void set_coo_matrix_data<FP_TYPE, INT_TYPE>(                                          \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
+        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, INT_TYPE* row_ind, \
+        INT_TYPE* col_ind, FP_TYPE* val)
+    template void init_csr_matrix<FP_TYPE, INT_TYPE>(                                              \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
+        oneapi::mkl::index_base index, sycl::buffer<INT_TYPE, 1> row_ptr,                          \
+        sycl::buffer<INT_TYPE, 1> col_ind, sycl::buffer<FP_TYPE, 1> val);                          \
+    template void init_csr_matrix<FP_TYPE, INT_TYPE>(                                              \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
+        oneapi::mkl::index_base index, INT_TYPE* row_ptr, INT_TYPE* col_ind, FP_TYPE* val);        \
+    template void set_csr_matrix_data<FP_TYPE, INT_TYPE>(                                          \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
+        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,                    \
+        sycl::buffer<INT_TYPE, 1> row_ptr, sycl::buffer<INT_TYPE, 1> col_ind,                      \
+        sycl::buffer<FP_TYPE, 1> val);                                                             \
+    template void set_csr_matrix_data<FP_TYPE, INT_TYPE>(                                          \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
+        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, INT_TYPE* row_ptr, \
+        INT_TYPE* col_ind, FP_TYPE* val)
 #define THROW_IF_NULLPTR(FUNC_NAME, PTR)                                       \
     if (!(PTR)) {                                                              \
         throw mkl::uninitialized("sparse_blas", FUNC_NAME,                     \
                                  std::string(#PTR) + " must not be nullptr."); \
+/// Throw mkl::invalid_argument from the calling function if \p argument differs from the value
+/// recorded in \p descr as last_optimized_<argument>. \p optimize_func_name is only used to
+/// build the error message. \p descr is parenthesized so any pointer expression can be passed.
+#define CHECK_DESCR_MATCH(descr, argument, optimize_func_name)                                    \
+    do {                                                                                          \
+        if ((descr)->last_optimized_##argument != argument) {                                     \
+            throw mkl::invalid_argument(                                                          \
+                "sparse_blas", __func__,                                                          \
+                #argument " argument must match with the previous call to " #optimize_func_name); \
+        }                                                                                         \
+    } while (0)
diff --git a/src/sparse_blas/sycl_helper.hpp b/src/sparse_blas/sycl_helper.hpp
new file mode 100644
index 000000000..1a055b405
--- /dev/null
+++ b/src/sparse_blas/sycl_helper.hpp
@@ -0,0 +1,80 @@
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#include <CL/sycl.hpp>
+namespace oneapi::mkl::sparse::detail {
+/// Return whether a pointer is accessible on the host
+template <typename T>
+inline bool is_ptr_accessible_on_host(sycl::queue queue, const T* host_or_device_ptr) {
+    auto alloc_type = sycl::get_pointer_type(host_or_device_ptr, queue.get_context());
+    return alloc_type == sycl::usm::alloc::host || alloc_type == sycl::usm::alloc::shared ||
+           alloc_type == sycl::usm::alloc::unknown;
+/// Return a scalar on the host from a pointer to host or device memory
+template <typename T>
+inline T get_scalar_on_host(sycl::queue& queue, const T* host_or_device_ptr,
+                            bool is_ptr_accessible_on_host) {
+    if (is_ptr_accessible_on_host) {
+        return *host_or_device_ptr;
+    }
+    T scalar;
+    auto event = queue.copy(host_or_device_ptr, &scalar, 1);
+    event.wait_and_throw();
+    return scalar;
+/// Submit the release of \p ptr in a host_task waiting on the dependencies
+template <typename T>
+sycl::event submit_release(sycl::queue& queue, T* ptr,
+                           const std::vector<sycl::event>& dependencies) {
+    return queue.submit([&](sycl::handler& cgh) {
+        cgh.depends_on(dependencies);
+        cgh.host_task([=]() { delete ptr; });
+    });
+/// Merge multiple event dependencies into one
+inline sycl::event collapse_dependencies(sycl::queue& queue,
+                                         const std::vector<sycl::event>& dependencies) {
+    if (dependencies.empty()) {
+        return {};
+    }
+    else if (dependencies.size() == 1) {
+        return dependencies[0];
+    }
+    return queue.submit([&](sycl::handler& cgh) {
+        cgh.depends_on(dependencies);
+        cgh.host_task([=]() {});
+    });
+} // namespace oneapi::mkl::sparse::detail
diff --git a/tests/unit_tests/CMakeLists.txt b/tests/unit_tests/CMakeLists.txt
index 235f8c8e5..d250a03a0 100644
--- a/tests/unit_tests/CMakeLists.txt
+++ b/tests/unit_tests/CMakeLists.txt
@@ -192,6 +192,11 @@ foreach(domain ${TEST_TARGET_DOMAINS})
     list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_dft_portfft)
+  if(domain STREQUAL "sparse_blas" AND ENABLE_CUSPARSE_BACKEND)
+    add_dependencies(test_main_${domain}_ct onemkl_${domain}_cusparse)
+    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_cusparse)
+  endif()
   target_link_libraries(test_main_${domain}_ct PUBLIC
diff --git a/tests/unit_tests/include/test_helper.hpp b/tests/unit_tests/include/test_helper.hpp
index c6eaf3421..cb27c9098 100644
--- a/tests/unit_tests/include/test_helper.hpp
+++ b/tests/unit_tests/include/test_helper.hpp
@@ -176,6 +176,13 @@
 #define TEST_RUN_PORTFFT_SELECT(q, func, ...)
+    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::cusparse>{ q }, __VA_ARGS__)
 #ifndef __HIPSYCL__
 #define CHECK_HOST_OR_CPU(q) q.get_device().is_cpu()
@@ -268,6 +275,9 @@
             if (vendor_id == INTEL_ID) {                                   \
                 TEST_RUN_INTELGPU_SELECT(q, func, __VA_ARGS__);            \
             }                                                              \
+            else if (vendor_id == NVIDIA_ID) {                             \
+                TEST_RUN_NVIDIAGPU_CUSPARSE_SELECT(q, func, __VA_ARGS__);  \
+            }                                                              \
         }                                                                  \
     } while (0);
diff --git a/tests/unit_tests/main_test.cpp b/tests/unit_tests/main_test.cpp
index 7e2ad079a..fa7dffcc6 100644
--- a/tests/unit_tests/main_test.cpp
+++ b/tests/unit_tests/main_test.cpp
@@ -126,7 +126,8 @@ int main(int argc, char** argv) {
     !defined(ONEMKL_ENABLE_CUSOLVER_BACKEND) &&                                         \
     !defined(ONEMKL_ENABLE_PORTBLAS_BACKEND_NVIDIA_GPU) &&                              \
                         if (dev.is_gpu() && vendor_id == NVIDIA_ID)
diff --git a/tests/unit_tests/sparse_blas/include/test_common.hpp b/tests/unit_tests/sparse_blas/include/test_common.hpp
index 628f55e2e..6637e0daa 100644
--- a/tests/unit_tests/sparse_blas/include/test_common.hpp
+++ b/tests/unit_tests/sparse_blas/include/test_common.hpp
@@ -59,12 +59,39 @@ enum sparse_matrix_format_t {
-static std::vector<std::set<oneapi::mkl::sparse::matrix_property>> test_matrix_properties{
-    { oneapi::mkl::sparse::matrix_property::sorted },
-    { oneapi::mkl::sparse::matrix_property::symmetric },
-    { oneapi::mkl::sparse::matrix_property::sorted,
-      oneapi::mkl::sparse::matrix_property::symmetric }
+inline std::set<oneapi::mkl::sparse::matrix_property> get_default_matrix_properties(
+    sycl::queue queue, sparse_matrix_format_t format) {
+    auto vendor_id = oneapi::mkl::get_device_id(queue);
+    if (vendor_id == oneapi::mkl::device::nvidiagpu && format == sparse_matrix_format_t::COO) {
+        return { oneapi::mkl::sparse::matrix_property::sorted_by_rows };
+    }
+    return {};
+/// Return the combinations of matrix_properties to test other than the default
+inline std::vector<std::set<oneapi::mkl::sparse::matrix_property>>
+get_all_matrix_properties_combinations(sycl::queue queue, sparse_matrix_format_t format) {
+    auto vendor_id = oneapi::mkl::get_device_id(queue);
+    if (vendor_id == oneapi::mkl::device::nvidiagpu && format == sparse_matrix_format_t::COO) {
+        // Ensure all the sets have the sorted or sorted_by_rows properties
+        return { { oneapi::mkl::sparse::matrix_property::sorted },
+                 { oneapi::mkl::sparse::matrix_property::sorted_by_rows,
+                   oneapi::mkl::sparse::matrix_property::symmetric },
+                 { oneapi::mkl::sparse::matrix_property::sorted,
+                   oneapi::mkl::sparse::matrix_property::symmetric } };
+    }
+    std::vector<std::set<oneapi::mkl::sparse::matrix_property>> properties_combinations{
+        { oneapi::mkl::sparse::matrix_property::sorted },
+        { oneapi::mkl::sparse::matrix_property::symmetric },
+        { oneapi::mkl::sparse::matrix_property::sorted,
+          oneapi::mkl::sparse::matrix_property::symmetric }
+    };
+    if (format == sparse_matrix_format_t::COO) {
+        properties_combinations.push_back({ oneapi::mkl::sparse::matrix_property::sorted_by_rows });
+    }
+    return properties_combinations;
 void print_error_code(sycl::exception const& e);
@@ -207,9 +234,9 @@ template <typename fpType>
 fpType generate_data(bool is_diag) {
     rand_scalar<fpType> rand_data;
     if (is_diag) {
-        // Guarantee an amplitude >= 0.1
+        // Guarantee a large amplitude
         fpType sign = (std::rand() % 2) * 2 - 1;
-        return rand_data(0.1, 0.5) * sign;
+        return rand_data(10, 20) * sign;
     return rand_data(-0.5, 0.5);
@@ -337,8 +364,18 @@ intType generate_random_matrix(sparse_matrix_format_t format, const intType nrow
 /// In CSR format, the elements within a row are shuffled without changing ia.
 /// In COO format, all the elements are shuffled.
 template <typename fpType, typename intType>
-void shuffle_sparse_matrix(sparse_matrix_format_t format, intType indexing, intType* ia,
-                           intType* ja, fpType* a, intType nnz, std::size_t nrows) {
+void shuffle_sparse_matrix_if_needed(
+    sparse_matrix_format_t format,
+    const std::set<oneapi::mkl::sparse::matrix_property>& matrix_properties, intType indexing,
+    intType* ia, intType* ja, fpType* a, intType nnz, std::size_t nrows) {
+    const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) !=
+                           matrix_properties.cend();
+    if (is_sorted) {
+        return;
+    }
+    const bool is_sorted_by_rows =
+        matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted_by_rows) !=
+        matrix_properties.cend();
     if (format == sparse_matrix_format_t::CSR) {
         for (std::size_t i = 0; i < nrows; ++i) {
             intType nnz_row = ia[i + 1] - ia[i];
@@ -349,18 +386,40 @@ void shuffle_sparse_matrix(sparse_matrix_format_t format, intType indexing, intT
                 std::swap(a[q], a[j]);
+        // sorted_by_rows does not impact CSR
     else if (format == sparse_matrix_format_t::COO) {
-        for (std::size_t i = 0; i < static_cast<std::size_t>(nnz); ++i) {
-            intType q = std::rand() % nnz;
-            // Swap elements i and q
-            std::swap(ia[q], ia[i]);
-            std::swap(ja[q], ja[i]);
-            std::swap(a[q], a[i]);
+        if (is_sorted_by_rows) {
+            // Walk each run of equal row indices; stopping at nnz keeps empty rows in bounds
+            std::size_t linear_idx = 0;
+            while (linear_idx < static_cast<std::size_t>(nnz)) {
+                // Count the number of non-zero elements for the given row
+                std::size_t nnz_row = 1;
+                while (linear_idx + nnz_row < static_cast<std::size_t>(nnz) &&
+                       ia[linear_idx] == ia[linear_idx + nnz_row]) {
+                    ++nnz_row;
+                }
+                for (std::size_t j = 0; j < nnz_row; ++j) {
+                    // Swap elements j and q within the same row
+                    std::size_t q = linear_idx + (static_cast<std::size_t>(std::rand()) % nnz_row);
+                    std::swap(ja[q], ja[linear_idx + j]);
+                    std::swap(a[q], a[linear_idx + j]);
+                }
+                linear_idx += nnz_row;
+            }
+        }
+        else {
+            for (std::size_t i = 0; i < static_cast<std::size_t>(nnz); ++i) {
+                intType q = std::rand() % nnz;
+                // Swap elements i and q
+                std::swap(ia[q], ia[i]);
+                std::swap(ja[q], ja[i]);
+                std::swap(a[q], a[i]);
+            }
     else {
-        throw oneapi::mkl::exception("sparse_blas", "shuffle_sparse_matrix",
+        throw oneapi::mkl::exception("sparse_blas", "shuffle_sparse_matrix_if_needed",
                                      "Internal error: unsupported format");
diff --git a/tests/unit_tests/sparse_blas/include/test_spmm.hpp b/tests/unit_tests/sparse_blas/include/test_spmm.hpp
index 17874cd63..153862f53 100644
--- a/tests/unit_tests/sparse_blas/include/test_spmm.hpp
+++ b/tests/unit_tests/sparse_blas/include/test_spmm.hpp
@@ -58,6 +58,7 @@ void test_helper_with_format_with_transpose(
     const std::vector<oneapi::mkl::sparse::spmm_alg>& non_default_algorithms,
     oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B, int& num_passed,
     int& num_skipped) {
+    sycl::property_list queue_properties;
     double density_A_matrix = 0.8;
     fpType fp_zero = set_fp_value<fpType>()(0.f, 0.f);
     fpType fp_one = set_fp_value<fpType>()(1.f, 0.f);
@@ -65,10 +66,13 @@ void test_helper_with_format_with_transpose(
     oneapi::mkl::layout col_major = oneapi::mkl::layout::col_major;
     oneapi::mkl::sparse::spmm_alg default_alg = oneapi::mkl::sparse::spmm_alg::default_alg;
     oneapi::mkl::sparse::matrix_view default_A_view;
-    std::set<oneapi::mkl::sparse::matrix_property> no_properties;
     bool no_reset_data = false;
     bool no_scalars_on_device = false;
+    // Queue is only used to get which matrix_property should be used for the tests.
+    sycl::queue properties_queue(*dev);
+    auto default_properties = get_default_matrix_properties(properties_queue, format);
         int m = 4, k = 6, n = 5;
         int nrows_A = (transpose_A != oneapi::mkl::transpose::nontrans) ? k : m;
@@ -82,107 +86,119 @@ void test_helper_with_format_with_transpose(
         // Basic test
-            test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero,
-                             col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc,
-                             default_alg, default_A_view, no_properties, no_reset_data,
-                             no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                             density_A_matrix, index_zero, col_major, transpose_A, transpose_B,
+                             fp_one, fp_zero, ldb, ldc, default_alg, default_A_view,
+                             default_properties, no_reset_data, no_scalars_on_device),
             num_passed, num_skipped);
         // Reset data
-            test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero,
-                             col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc,
-                             default_alg, default_A_view, no_properties, true,
-                             no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                             density_A_matrix, index_zero, col_major, transpose_A, transpose_B,
+                             fp_one, fp_zero, ldb, ldc, default_alg, default_A_view,
+                             default_properties, true, no_scalars_on_device),
             num_passed, num_skipped);
         // Test alpha and beta on the device
-            test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero,
-                             col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc,
-                             default_alg, default_A_view, no_properties, no_reset_data, true),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                             density_A_matrix, index_zero, col_major, transpose_A, transpose_B,
+                             fp_one, fp_zero, ldb, ldc, default_alg, default_A_view,
+                             default_properties, no_reset_data, true),
             num_passed, num_skipped);
         // Test index_base 1
-            test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix,
-                             oneapi::mkl::index_base::one, col_major, transpose_A, transpose_B,
-                             fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, no_properties,
-                             no_reset_data, no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                             density_A_matrix, oneapi::mkl::index_base::one, col_major, transpose_A,
+                             transpose_B, fp_one, fp_zero, ldb, ldc, default_alg, default_A_view,
+                             default_properties, no_reset_data, no_scalars_on_device),
             num_passed, num_skipped);
         // Test non-default alpha
-            test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero,
-                             col_major, transpose_A, transpose_B, set_fp_value<fpType>()(2.f, 1.5f),
-                             fp_zero, ldb, ldc, default_alg, default_A_view, no_properties,
-                             no_reset_data, no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                             density_A_matrix, index_zero, col_major, transpose_A, transpose_B,
+                             set_fp_value<fpType>()(2.f, 1.5f), fp_zero, ldb, ldc, default_alg,
+                             default_A_view, default_properties, no_reset_data,
+                             no_scalars_on_device),
             num_passed, num_skipped);
         // Test non-default beta
-            test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero,
-                             col_major, transpose_A, transpose_B, fp_one,
-                             set_fp_value<fpType>()(3.2f, 1.f), ldb, ldc, default_alg,
-                             default_A_view, no_properties, no_reset_data, no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                             density_A_matrix, index_zero, col_major, transpose_A, transpose_B,
+                             fp_one, set_fp_value<fpType>()(3.2f, 1.f), ldb, ldc, default_alg,
+                             default_A_view, default_properties, no_reset_data,
+                             no_scalars_on_device),
             num_passed, num_skipped);
         // Test 0 alpha
-            test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero,
-                             col_major, transpose_A, transpose_B, fp_zero, fp_one, ldb, ldc,
-                             default_alg, default_A_view, no_properties, no_reset_data,
-                             no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                             density_A_matrix, index_zero, col_major, transpose_A, transpose_B,
+                             fp_zero, fp_one, ldb, ldc, default_alg, default_A_view,
+                             default_properties, no_reset_data, no_scalars_on_device),
             num_passed, num_skipped);
         // Test 0 alpha and beta
-            test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero,
-                             col_major, transpose_A, transpose_B, fp_zero, fp_zero, ldb, ldc,
-                             default_alg, default_A_view, no_properties, no_reset_data,
-                             no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                             density_A_matrix, index_zero, col_major, transpose_A, transpose_B,
+                             fp_zero, fp_zero, ldb, ldc, default_alg, default_A_view,
+                             default_properties, no_reset_data, no_scalars_on_device),
             num_passed, num_skipped);
         // Test non-default ldb
-            test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero,
-                             col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb + 5, ldc,
-                             default_alg, default_A_view, no_properties, no_reset_data,
-                             no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                             density_A_matrix, index_zero, col_major, transpose_A, transpose_B,
+                             fp_one, fp_zero, ldb + 5, ldc, default_alg, default_A_view,
+                             default_properties, no_reset_data, no_scalars_on_device),
             num_passed, num_skipped);
         // Test non-default ldc
-            test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero,
-                             col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc + 6,
-                             default_alg, default_A_view, no_properties, no_reset_data,
-                             no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                             density_A_matrix, index_zero, col_major, transpose_A, transpose_B,
+                             fp_one, fp_zero, ldb, ldc + 6, default_alg, default_A_view,
+                             default_properties, no_reset_data, no_scalars_on_device),
             num_passed, num_skipped);
         // Test row major layout
-            test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero,
-                             oneapi::mkl::layout::row_major, transpose_A, transpose_B, fp_one,
-                             fp_zero, ncols_B, ncols_C, default_alg, default_A_view, no_properties,
-                             no_reset_data, no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                             density_A_matrix, index_zero, oneapi::mkl::layout::row_major,
+                             transpose_A, transpose_B, fp_one, fp_zero, ncols_B, ncols_C,
+                             default_alg, default_A_view, default_properties, no_reset_data,
+                             no_scalars_on_device),
             num_passed, num_skipped);
         // Test int64 indices
         long long_nrows_A = 27, long_ncols_A = 13, long_ncols_C = 6;
         auto [long_ldc, long_ldb] = swap_if_transposed(transpose_A, long_nrows_A, long_ncols_A);
-            test_functor_i64(dev, format, long_nrows_A, long_ncols_A, long_ncols_C,
-                             density_A_matrix, index_zero, col_major, transpose_A, transpose_B,
-                             fp_one, fp_zero, long_ldb, long_ldc, default_alg, default_A_view,
-                             no_properties, no_reset_data, no_scalars_on_device),
+            test_functor_i64(dev, queue_properties, format, long_nrows_A, long_ncols_A,
+                             long_ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
+                             transpose_B, fp_one, fp_zero, long_ldb, long_ldc, default_alg,
+                             default_A_view, default_properties, no_reset_data,
+                             no_scalars_on_device),
             num_passed, num_skipped);
         // Test other algorithms
         for (auto alg : non_default_algorithms) {
-                test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix,
-                                 index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero,
-                                 ldb, ldc, alg, default_A_view, no_properties, no_reset_data,
-                                 no_scalars_on_device),
+                test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                                 density_A_matrix, index_zero, col_major, transpose_A, transpose_B,
+                                 fp_one, fp_zero, ldb, ldc, alg, default_A_view, default_properties,
+                                 no_reset_data, no_scalars_on_device),
                 num_passed, num_skipped);
         // Test matrix properties
-        for (auto properties : test_matrix_properties) {
+        for (auto properties : get_all_matrix_properties_combinations(properties_queue, format)) {
-                test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix,
-                                 index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero,
-                                 ldb, ldc, default_alg, default_A_view, properties, no_reset_data,
-                                 no_scalars_on_device),
+                test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                                 density_A_matrix, index_zero, col_major, transpose_A, transpose_B,
+                                 fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, properties,
+                                 no_reset_data, no_scalars_on_device),
                 num_passed, num_skipped);
+        // In-order queue
+            test_functor_i32(dev, { sycl::property::queue::in_order{} }, format, nrows_A, ncols_A,
+                             ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
+                             transpose_B, fp_one, fp_zero, ldb, ldc, default_alg, default_A_view,
+                             default_properties, no_reset_data, no_scalars_on_device),
+            num_passed, num_skipped);
         // Test different sizes
@@ -195,10 +211,10 @@ void test_helper_with_format_with_transpose(
         int ldb = nrows_B;
         int ldc = nrows_C;
-            test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero,
-                             col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc,
-                             default_alg, default_A_view, no_properties, no_reset_data,
-                             no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C,
+                             density_A_matrix, index_zero, col_major, transpose_A, transpose_B,
+                             fp_one, fp_zero, ldb, ldc, default_alg, default_A_view,
+                             default_properties, no_reset_data, no_scalars_on_device),
             num_passed, num_skipped);
diff --git a/tests/unit_tests/sparse_blas/include/test_spmv.hpp b/tests/unit_tests/sparse_blas/include/test_spmv.hpp
index 654a1bfd4..50b5aa7db 100644
--- a/tests/unit_tests/sparse_blas/include/test_spmv.hpp
+++ b/tests/unit_tests/sparse_blas/include/test_spmv.hpp
@@ -56,6 +56,7 @@ void test_helper_with_format_with_transpose(
     sparse_matrix_format_t format,
     const std::vector<oneapi::mkl::sparse::spmv_alg>& non_default_algorithms,
     oneapi::mkl::transpose transpose_val, int& num_passed, int& num_skipped) {
+    sycl::property_list queue_properties;
     double density_A_matrix = 0.8;
     fpType fp_zero = set_fp_value<fpType>()(0.f, 0.f);
     fpType fp_one = set_fp_value<fpType>()(1.f, 0.f);
@@ -63,139 +64,152 @@ void test_helper_with_format_with_transpose(
     oneapi::mkl::index_base index_zero = oneapi::mkl::index_base::zero;
     oneapi::mkl::sparse::spmv_alg default_alg = oneapi::mkl::sparse::spmv_alg::default_alg;
     oneapi::mkl::sparse::matrix_view default_A_view;
-    std::set<oneapi::mkl::sparse::matrix_property> no_properties;
     bool no_reset_data = false;
     bool no_scalars_on_device = false;
+    // This queue is only used to query which matrix_property values the tests should use.
+    sycl::queue properties_queue(*dev);
+    auto default_properties = get_default_matrix_properties(properties_queue, format);
     // Basic test
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_one, fp_zero, default_alg, default_A_view, no_properties, no_reset_data,
-                         no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_one, fp_zero, default_alg, default_A_view,
+                         default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Reset data
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_one, fp_zero, default_alg, default_A_view, no_properties, true,
-                         no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_one, fp_zero, default_alg, default_A_view,
+                         default_properties, true, no_scalars_on_device),
         num_passed, num_skipped);
     // Test alpha and beta on the device
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_one, fp_zero, default_alg, default_A_view, no_properties, no_reset_data,
-                         true),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_one, fp_zero, default_alg, default_A_view,
+                         default_properties, no_reset_data, true),
         num_passed, num_skipped);
     // Test index_base 1
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix,
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
                          oneapi::mkl::index_base::one, transpose_val, fp_one, fp_zero, default_alg,
-                         default_A_view, no_properties, no_reset_data, no_scalars_on_device),
+                         default_A_view, default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Test non-default alpha
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         set_fp_value<fpType>()(2.f, 1.5f), fp_zero, default_alg, default_A_view,
-                         no_properties, no_reset_data, no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, set_fp_value<fpType>()(2.f, 1.5f), fp_zero,
+                         default_alg, default_A_view, default_properties, no_reset_data,
+                         no_scalars_on_device),
         num_passed, num_skipped);
     // Test non-default beta
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_one, set_fp_value<fpType>()(3.2f, 1.f), default_alg, default_A_view,
-                         no_properties, no_reset_data, no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_one, set_fp_value<fpType>()(3.2f, 1.f),
+                         default_alg, default_A_view, default_properties, no_reset_data,
+                         no_scalars_on_device),
         num_passed, num_skipped);
     // Test 0 alpha
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_zero, fp_one, default_alg, default_A_view, no_properties, no_reset_data,
-                         no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_zero, fp_one, default_alg, default_A_view,
+                         default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Test 0 alpha and beta
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_zero, fp_zero, default_alg, default_A_view, no_properties,
-                         no_reset_data, no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_zero, fp_zero, default_alg, default_A_view,
+                         default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Test int64 indices
-        test_functor_i64(dev, format, 27L, 13L, density_A_matrix, index_zero, transpose_val, fp_one,
-                         fp_zero, default_alg, default_A_view, no_properties, no_reset_data,
-                         no_scalars_on_device),
+        test_functor_i64(dev, queue_properties, format, 27L, 13L, density_A_matrix, index_zero,
+                         transpose_val, fp_one, fp_zero, default_alg, default_A_view,
+                         default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Lower triangular
     oneapi::mkl::sparse::matrix_view triangular_A_view(
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_one, fp_zero, default_alg, triangular_A_view, no_properties,
-                         no_reset_data, no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_one, fp_zero, default_alg, triangular_A_view,
+                         default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Upper triangular
     triangular_A_view.uplo_view = oneapi::mkl::uplo::upper;
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_one, fp_zero, default_alg, triangular_A_view, no_properties,
-                         no_reset_data, no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_one, fp_zero, default_alg, triangular_A_view,
+                         default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Lower triangular unit diagonal
     oneapi::mkl::sparse::matrix_view triangular_unit_A_view(
     triangular_unit_A_view.diag_view = oneapi::mkl::diag::unit;
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_one, fp_zero, default_alg, triangular_unit_A_view, no_properties,
-                         no_reset_data, no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_one, fp_zero, default_alg,
+                         triangular_unit_A_view, default_properties, no_reset_data,
+                         no_scalars_on_device),
         num_passed, num_skipped);
     // Upper triangular unit diagonal
     triangular_A_view.uplo_view = oneapi::mkl::uplo::upper;
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_one, fp_zero, default_alg, triangular_unit_A_view, no_properties,
-                         no_reset_data, no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_one, fp_zero, default_alg,
+                         triangular_unit_A_view, default_properties, no_reset_data,
+                         no_scalars_on_device),
         num_passed, num_skipped);
     // Lower symmetric
     oneapi::mkl::sparse::matrix_view symmetric_view(oneapi::mkl::sparse::matrix_descr::symmetric);
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_one, fp_zero, default_alg, symmetric_view, no_properties, no_reset_data,
-                         no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_one, fp_zero, default_alg, symmetric_view,
+                         default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Upper symmetric
     symmetric_view.uplo_view = oneapi::mkl::uplo::upper;
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_one, fp_zero, default_alg, symmetric_view, no_properties, no_reset_data,
-                         no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_one, fp_zero, default_alg, symmetric_view,
+                         default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Lower hermitian
     oneapi::mkl::sparse::matrix_view hermitian_view(oneapi::mkl::sparse::matrix_descr::hermitian);
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_one, fp_zero, default_alg, hermitian_view, no_properties, no_reset_data,
-                         no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_one, fp_zero, default_alg, hermitian_view,
+                         default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Upper hermitian
     hermitian_view.uplo_view = oneapi::mkl::uplo::upper;
-        test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val,
-                         fp_one, fp_zero, default_alg, hermitian_view, no_properties, no_reset_data,
-                         no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                         index_zero, transpose_val, fp_one, fp_zero, default_alg, hermitian_view,
+                         default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Test other algorithms
     for (auto alg : non_default_algorithms) {
-            test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero,
-                             transpose_val, fp_one, fp_zero, alg, default_A_view, no_properties,
-                             no_reset_data, no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                             index_zero, transpose_val, fp_one, fp_zero, alg, default_A_view,
+                             default_properties, no_reset_data, no_scalars_on_device),
             num_passed, num_skipped);
     // Test matrix properties
-    for (auto properties : test_matrix_properties) {
+    for (auto properties : get_all_matrix_properties_combinations(properties_queue, format)) {
-            test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero,
-                             transpose_val, fp_one, fp_zero, default_alg, default_A_view,
-                             properties, no_reset_data, no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix,
+                             index_zero, transpose_val, fp_one, fp_zero, default_alg,
+                             default_A_view, properties, no_reset_data, no_scalars_on_device),
             num_passed, num_skipped);
+    // In-order queue
+        test_functor_i32(dev, { sycl::property::queue::in_order{} }, format, nrows_A, ncols_A,
+                         density_A_matrix, index_zero, transpose_val, fp_one, fp_zero, default_alg,
+                         default_A_view, default_properties, no_reset_data, no_scalars_on_device),
+        num_passed, num_skipped);
diff --git a/tests/unit_tests/sparse_blas/include/test_spsv.hpp b/tests/unit_tests/sparse_blas/include/test_spsv.hpp
index 032a0875b..94f5eacb1 100644
--- a/tests/unit_tests/sparse_blas/include/test_spsv.hpp
+++ b/tests/unit_tests/sparse_blas/include/test_spsv.hpp
@@ -51,6 +51,7 @@ void test_helper_with_format(testFunctorI32 test_functor_i32, testFunctorI64 tes
                              sycl::device* dev, sparse_matrix_format_t format,
                              oneapi::mkl::transpose transpose_val, int& num_passed,
                              int& num_skipped) {
+    sycl::property_list queue_properties;
     double density_A_matrix = 0.144;
     fpType alpha = set_fp_value<fpType>()(1.f, 0.f);
     int m = 277;
@@ -60,89 +61,104 @@ void test_helper_with_format(testFunctorI32 test_functor_i32, testFunctorI64 tes
     oneapi::mkl::sparse::matrix_view default_A_view(oneapi::mkl::sparse::matrix_descr::triangular);
     oneapi::mkl::sparse::matrix_view upper_A_view(oneapi::mkl::sparse::matrix_descr::triangular);
     upper_A_view.uplo_view = oneapi::mkl::uplo::upper;
-    std::set<oneapi::mkl::sparse::matrix_property> no_properties;
     bool no_reset_data = false;
     bool no_scalars_on_device = false;
+    // This queue is only used to query which matrix_property values the tests should use.
+    sycl::queue properties_queue(*dev);
+    auto default_properties = get_default_matrix_properties(properties_queue, format);
     // Basic test
-    EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i32(dev, format, m, density_A_matrix, index_zero,
-                                                transpose_val, alpha, default_alg, default_A_view,
-                                                no_properties, no_reset_data, no_scalars_on_device),
-                               num_passed, num_skipped);
+        test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero,
+                         transpose_val, alpha, default_alg, default_A_view, default_properties,
+                         no_reset_data, no_scalars_on_device),
+        num_passed, num_skipped);
     // Reset data
-        test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha,
-                         default_alg, default_A_view, no_properties, true, no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero,
+                         transpose_val, alpha, default_alg, default_A_view, default_properties,
+                         true, no_scalars_on_device),
         num_passed, num_skipped);
     // Test alpha on the device
-        test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha,
-                         default_alg, default_A_view, no_properties, no_reset_data, true),
+        test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero,
+                         transpose_val, alpha, default_alg, default_A_view, default_properties,
+                         no_reset_data, true),
         num_passed, num_skipped);
     // Test index_base 1
-        test_functor_i32(dev, format, m, density_A_matrix, oneapi::mkl::index_base::one,
-                         transpose_val, alpha, default_alg, default_A_view, no_properties,
-                         no_reset_data, no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, m, density_A_matrix,
+                         oneapi::mkl::index_base::one, transpose_val, alpha, default_alg,
+                         default_A_view, default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Test upper triangular matrix
-    EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i32(dev, format, m, density_A_matrix, index_zero,
-                                                transpose_val, alpha, default_alg, upper_A_view,
-                                                no_properties, no_reset_data, no_scalars_on_device),
-                               num_passed, num_skipped);
+        test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero,
+                         transpose_val, alpha, default_alg, upper_A_view, default_properties,
+                         no_reset_data, no_scalars_on_device),
+        num_passed, num_skipped);
     // Test lower triangular unit diagonal matrix
     oneapi::mkl::sparse::matrix_view triangular_unit_A_view(
     triangular_unit_A_view.diag_view = oneapi::mkl::diag::unit;
-        test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha,
-                         default_alg, triangular_unit_A_view, no_properties, no_reset_data,
-                         no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero,
+                         transpose_val, alpha, default_alg, triangular_unit_A_view,
+                         default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Test upper triangular unit diagonal matrix
     triangular_unit_A_view.uplo_view = oneapi::mkl::uplo::upper;
-        test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha,
-                         default_alg, triangular_unit_A_view, no_properties, no_reset_data,
-                         no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero,
+                         transpose_val, alpha, default_alg, triangular_unit_A_view,
+                         default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Test non-default alpha
-        test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val,
-                         set_fp_value<fpType>()(2.f, 1.5f), default_alg, default_A_view,
-                         no_properties, no_reset_data, no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero,
+                         transpose_val, set_fp_value<fpType>()(2.f, 1.5f), default_alg,
+                         default_A_view, default_properties, no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Test int64 indices
-    EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i64(dev, format, 15L, density_A_matrix, index_zero,
-                                                transpose_val, alpha, default_alg, default_A_view,
-                                                no_properties, no_reset_data, no_scalars_on_device),
-                               num_passed, num_skipped);
+        test_functor_i64(dev, queue_properties, format, 15L, density_A_matrix, index_zero,
+                         transpose_val, alpha, default_alg, default_A_view, default_properties,
+                         no_reset_data, no_scalars_on_device),
+        num_passed, num_skipped);
     // Test lower no_optimize_alg
-        test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha,
-                         no_optimize_alg, default_A_view, no_properties, no_reset_data,
-                         no_scalars_on_device),
+        test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero,
+                         transpose_val, alpha, no_optimize_alg, default_A_view, default_properties,
+                         no_reset_data, no_scalars_on_device),
         num_passed, num_skipped);
     // Test upper no_optimize_alg
-    EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i32(dev, format, m, density_A_matrix, index_zero,
-                                                transpose_val, alpha, no_optimize_alg, upper_A_view,
-                                                no_properties, no_reset_data, no_scalars_on_device),
-                               num_passed, num_skipped);
+        test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero,
+                         transpose_val, alpha, no_optimize_alg, upper_A_view, default_properties,
+                         no_reset_data, no_scalars_on_device),
+        num_passed, num_skipped);
     // Test matrix properties
-    for (auto properties : test_matrix_properties) {
+    for (auto properties : get_all_matrix_properties_combinations(properties_queue, format)) {
         // Basic test with matrix properties
-            test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha,
-                             default_alg, default_A_view, properties, no_reset_data,
-                             no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero,
+                             transpose_val, alpha, default_alg, default_A_view, properties,
+                             no_reset_data, no_scalars_on_device),
             num_passed, num_skipped);
         // Test lower no_optimize_alg with matrix properties
-            test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha,
-                             no_optimize_alg, default_A_view, properties, no_reset_data,
-                             no_scalars_on_device),
+            test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero,
+                             transpose_val, alpha, no_optimize_alg, default_A_view, properties,
+                             no_reset_data, no_scalars_on_device),
             num_passed, num_skipped);
+    // In-order queue
+        test_functor_i32(dev, { sycl::property::queue::in_order{} }, format, m, density_A_matrix,
+                         index_zero, transpose_val, alpha, default_alg, default_A_view,
+                         default_properties, no_reset_data, no_scalars_on_device),
+        num_passed, num_skipped);
diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp
index 0d95630bf..50f0fb2e7 100644
--- a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp
+++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp
@@ -28,8 +28,9 @@ extern std::vector<sycl::device*> devices;
 namespace {
 template <typename fpType, typename intType>
-int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, intType ncols_A,
-              intType ncols_C, double density_A_matrix, oneapi::mkl::index_base index,
+int test_spmm(sycl::device* dev, sycl::property_list queue_properties,
+              sparse_matrix_format_t format, intType nrows_A, intType ncols_A, intType ncols_C,
+              double density_A_matrix, oneapi::mkl::index_base index,
               oneapi::mkl::layout dense_matrix_layout, oneapi::mkl::transpose transpose_A,
               oneapi::mkl::transpose transpose_B, fpType alpha, fpType beta, intType ldb,
               intType ldc, oneapi::mkl::sparse::spmm_alg alg,
@@ -40,7 +41,7 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
         // Scalars on the device is not planned to be supported with the buffer API
         return 1;
-    sycl::queue main_queue(*dev, exception_handler_t());
+    sycl::queue main_queue(*dev, exception_handler_t(), queue_properties);
     if (require_square_matrix(A_view, matrix_properties)) {
         ncols_A = nrows_A;
@@ -51,8 +52,6 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
     auto [opa_nrows, opa_ncols] = swap_if_transposed<std::size_t>(transpose_A, nrows_A, ncols_A);
     auto [opb_nrows, opb_ncols] = swap_if_transposed<std::int64_t>(transpose_B, opa_ncols, ncols_C);
     intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
-    const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) !=
-                           matrix_properties.cend();
     const bool is_symmetric =
         matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) !=
@@ -73,10 +72,9 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
     std::vector<fpType> c_ref_host(c_host);
     // Shuffle ordering of column indices/values to test sortedness
-    if (!is_sorted) {
-        shuffle_sparse_matrix(format, indexing,,,, nnz,
-                              static_cast<std::size_t>(nrows_A));
-    }
+    shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing,,
+                          ,, nnz,
+                                    static_cast<std::size_t>(nrows_A));
     auto ia_buf = make_buffer(ia_host);
     auto ja_buf = make_buffer(ja_host);
@@ -119,10 +117,9 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
             intType reset_nnz = generate_random_matrix<fpType, intType>(
                 format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host,
-            if (!is_sorted) {
-                shuffle_sparse_matrix(format, indexing,,,
-                            , reset_nnz, static_cast<std::size_t>(nrows_A));
-            }
+            shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing,,
+                                  ,, reset_nnz,
+                                            static_cast<std::size_t>(nrows_A));
             if (reset_nnz > nnz) {
                 ia_buf = make_buffer(ia_host);
                 ja_buf = make_buffer(ja_host);
diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp
index 3f09594eb..1db7c7a25 100644
--- a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp
+++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp
@@ -28,15 +28,16 @@ extern std::vector<sycl::device*> devices;
 namespace {
 template <typename fpType, typename intType>
-int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, intType ncols_A,
-              intType ncols_C, double density_A_matrix, oneapi::mkl::index_base index,
+int test_spmm(sycl::device* dev, sycl::property_list queue_properties,
+              sparse_matrix_format_t format, intType nrows_A, intType ncols_A, intType ncols_C,
+              double density_A_matrix, oneapi::mkl::index_base index,
               oneapi::mkl::layout dense_matrix_layout, oneapi::mkl::transpose transpose_A,
               oneapi::mkl::transpose transpose_B, fpType alpha, fpType beta, intType ldb,
               intType ldc, oneapi::mkl::sparse::spmm_alg alg,
               oneapi::mkl::sparse::matrix_view A_view,
               const std::set<oneapi::mkl::sparse::matrix_property>& matrix_properties,
               bool reset_data, bool test_scalar_on_device) {
-    sycl::queue main_queue(*dev, exception_handler_t());
+    sycl::queue main_queue(*dev, exception_handler_t(), queue_properties);
     if (require_square_matrix(A_view, matrix_properties)) {
         ncols_A = nrows_A;
@@ -47,8 +48,6 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
     auto [opa_nrows, opa_ncols] = swap_if_transposed<std::size_t>(transpose_A, nrows_A, ncols_A);
     auto [opb_nrows, opb_ncols] = swap_if_transposed<std::int64_t>(transpose_B, opa_ncols, ncols_C);
     intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
-    const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) !=
-                           matrix_properties.cend();
     const bool is_symmetric =
         matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) !=
@@ -69,10 +68,9 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
     std::vector<fpType> c_ref_host(c_host);
     // Shuffle ordering of column indices/values to test sortedness
-    if (!is_sorted) {
-        shuffle_sparse_matrix(format, indexing,,,, nnz,
-                              static_cast<std::size_t>(nrows_A));
-    }
+    shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing,,
+                          ,, nnz,
+                                    static_cast<std::size_t>(nrows_A));
     auto ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size());
     auto ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size());
@@ -88,26 +86,21 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
     fpType* b_usm = b_usm_uptr.get();
     fpType* c_usm = c_usm_uptr.get();
-    std::vector<sycl::event> mat_dependencies;
-    std::vector<sycl::event> spmm_dependencies;
+    std::vector<sycl::event> dependencies;
     // Copy host to device
-    mat_dependencies.push_back(
+    dependencies.push_back(
         main_queue.memcpy(ia_usm,, ia_host.size() * sizeof(intType)));
-    mat_dependencies.push_back(
+    dependencies.push_back(
         main_queue.memcpy(ja_usm,, ja_host.size() * sizeof(intType)));
-    mat_dependencies.push_back(
-        main_queue.memcpy(a_usm,, a_host.size() * sizeof(fpType)));
-    spmm_dependencies.push_back(
-        main_queue.memcpy(b_usm,, b_host.size() * sizeof(fpType)));
-    spmm_dependencies.push_back(
-        main_queue.memcpy(c_usm,, c_host.size() * sizeof(fpType)));
+    dependencies.push_back(main_queue.memcpy(a_usm,, a_host.size() * sizeof(fpType)));
+    dependencies.push_back(main_queue.memcpy(b_usm,, b_host.size() * sizeof(fpType)));
+    dependencies.push_back(main_queue.memcpy(c_usm,, c_host.size() * sizeof(fpType)));
     fpType* alpha_host_or_usm_ptr = &alpha;
     fpType* beta_host_or_usm_ptr = &beta;
     if (test_scalar_on_device) {
-        spmm_dependencies.push_back(
-            main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType)));
-        spmm_dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType)));
+        dependencies.push_back(main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType)));
+        dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType)));
         alpha_host_or_usm_ptr = alpha_usm_uptr.get();
         beta_host_or_usm_ptr = beta_usm_uptr.get();
@@ -141,24 +134,20 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
         sycl::event ev_opt;
         CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spmm_optimize, main_queue, transpose_A,
                       transpose_B, &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, descr,
-                      workspace_usm.get(), mat_dependencies);
+                      workspace_usm.get(), dependencies);
-        spmm_dependencies.push_back(ev_opt);
         CALL_RT_OR_CT(ev_spmm = oneapi::mkl::sparse::spmm, main_queue, transpose_A, transpose_B,
-                      &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, descr,
-                      spmm_dependencies);
+                      &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, descr, { ev_opt });
         if (reset_data) {
             intType reset_nnz = generate_random_matrix<fpType, intType>(
                 format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host,
-            if (!is_sorted) {
-                shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(),
-                                      a_host.data(), reset_nnz, static_cast<std::size_t>(nrows_A));
-            }
+            shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(),
+                                            ja_host.data(), a_host.data(), reset_nnz,
+                                            static_cast<std::size_t>(nrows_A));
+            ev_spmm.wait_and_throw();
             if (reset_nnz > nnz) {
-                // Wait before freeing usm pointers
-                ev_spmm.wait_and_throw();
                 ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size());
                 ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size());
                 a_usm_uptr = malloc_device_uptr<fpType>(main_queue, a_host.size());
@@ -168,14 +157,14 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
             nnz = reset_nnz;
-            mat_dependencies.clear();
-            mat_dependencies.push_back(main_queue.memcpy(
-                ia_usm, ia_host.data(), ia_host.size() * sizeof(intType), ev_spmm));
-            mat_dependencies.push_back(main_queue.memcpy(
-                ja_usm, ja_host.data(), ja_host.size() * sizeof(intType), ev_spmm));
-            mat_dependencies.push_back(
+            dependencies.clear();
+            dependencies.push_back(main_queue.memcpy(ia_usm, ia_host.data(),
+                                                     ia_host.size() * sizeof(intType), ev_spmm));
+            dependencies.push_back(main_queue.memcpy(ja_usm, ja_host.data(),
+                                                     ja_host.size() * sizeof(intType), ev_spmm));
+            dependencies.push_back(
                 main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType), ev_spmm));
-            mat_dependencies.push_back(
+            dependencies.push_back(
                 main_queue.memcpy(c_usm, c_host.data(), c_host.size() * sizeof(fpType), ev_spmm));
             set_matrix_data(main_queue, format, A_handle, nrows_A, ncols_A, nnz, index, ia_usm,
                             ja_usm, a_usm);
@@ -190,7 +179,7 @@ int test_spmm(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
             CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spmm_optimize, main_queue, transpose_A,
                           transpose_B, &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg,
-                          descr, workspace_usm.get(), mat_dependencies);
+                          descr, workspace_usm.get(), dependencies);
             CALL_RT_OR_CT(ev_spmm = oneapi::mkl::sparse::spmm, main_queue, transpose_A, transpose_B,
                           &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, descr,
diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp
index 1864f6065..96328372d 100644
--- a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp
+++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp
@@ -28,7 +28,8 @@ extern std::vector<sycl::device*> devices;
 namespace {
 template <typename fpType, typename intType>
-int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, intType ncols_A,
+int test_spmv(sycl::device* dev, sycl::property_list queue_properties,
+              sparse_matrix_format_t format, intType nrows_A, intType ncols_A,
               double density_A_matrix, oneapi::mkl::index_base index,
               oneapi::mkl::transpose transpose_val, fpType alpha, fpType beta,
               oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::matrix_view A_view,
@@ -38,15 +39,13 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
         // Scalars on the device is not planned to be supported with the buffer API
         return 1;
-    sycl::queue main_queue(*dev, exception_handler_t());
+    sycl::queue main_queue(*dev, exception_handler_t(), queue_properties);
     if (require_square_matrix(A_view, matrix_properties)) {
         ncols_A = nrows_A;
     auto [opa_nrows, opa_ncols] = swap_if_transposed<std::size_t>(transpose_val, nrows_A, ncols_A);
     intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
-    const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) !=
-                           matrix_properties.cend();
     const bool is_symmetric =
         matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) !=
@@ -66,10 +65,9 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
     std::vector<fpType> y_ref_host(y_host);
     // Shuffle ordering of column indices/values to test sortedness
-    if (!is_sorted) {
-        shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz,
-                              static_cast<std::size_t>(nrows_A));
-    }
+    shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(),
+                                    ja_host.data(), a_host.data(), nnz,
+                                    static_cast<std::size_t>(nrows_A));
     auto ia_buf = make_buffer(ia_host);
     auto ja_buf = make_buffer(ja_host);
@@ -109,10 +107,9 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
             intType reset_nnz = generate_random_matrix<fpType, intType>(
                 format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host,
-            if (!is_sorted) {
-                shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(),
-                                      a_host.data(), reset_nnz, static_cast<std::size_t>(nrows_A));
-            }
+            shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(),
+                                            ja_host.data(), a_host.data(), reset_nnz,
+                                            static_cast<std::size_t>(nrows_A));
             if (reset_nnz > nnz) {
                 ia_buf = make_buffer(ia_host);
                 ja_buf = make_buffer(ja_host);
diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp
index b24a6e0ee..c6159aaf4 100644
--- a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp
+++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp
@@ -28,21 +28,20 @@ extern std::vector<sycl::device*> devices;
 namespace {
 template <typename fpType, typename intType>
-int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A, intType ncols_A,
+int test_spmv(sycl::device* dev, sycl::property_list queue_properties,
+              sparse_matrix_format_t format, intType nrows_A, intType ncols_A,
               double density_A_matrix, oneapi::mkl::index_base index,
               oneapi::mkl::transpose transpose_val, fpType alpha, fpType beta,
               oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::matrix_view A_view,
               const std::set<oneapi::mkl::sparse::matrix_property>& matrix_properties,
               bool reset_data, bool test_scalar_on_device) {
-    sycl::queue main_queue(*dev, exception_handler_t());
+    sycl::queue main_queue(*dev, exception_handler_t(), queue_properties);
     if (require_square_matrix(A_view, matrix_properties)) {
         ncols_A = nrows_A;
     auto [opa_nrows, opa_ncols] = swap_if_transposed<std::size_t>(transpose_val, nrows_A, ncols_A);
     intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
-    const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) !=
-                           matrix_properties.cend();
     const bool is_symmetric =
         matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) !=
@@ -62,10 +61,9 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
     std::vector<fpType> y_ref_host(y_host);
     // Shuffle ordering of column indices/values to test sortedness
-    if (!is_sorted) {
-        shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz,
-                              static_cast<std::size_t>(nrows_A));
-    }
+    shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(),
+                                    ja_host.data(), a_host.data(), nnz,
+                                    static_cast<std::size_t>(nrows_A));
     auto ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size());
     auto ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size());
@@ -81,26 +79,21 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
     fpType* x_usm = x_usm_uptr.get();
     fpType* y_usm = y_usm_uptr.get();
-    std::vector<sycl::event> mat_dependencies;
-    std::vector<sycl::event> spmv_dependencies;
+    std::vector<sycl::event> dependencies;
     // Copy host to device
-    mat_dependencies.push_back(
+    dependencies.push_back(
         main_queue.memcpy(ia_usm, ia_host.data(), ia_host.size() * sizeof(intType)));
-    mat_dependencies.push_back(
+    dependencies.push_back(
         main_queue.memcpy(ja_usm, ja_host.data(), ja_host.size() * sizeof(intType)));
-    mat_dependencies.push_back(
-        main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType)));
-    spmv_dependencies.push_back(
-        main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType)));
-    spmv_dependencies.push_back(
-        main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType)));
+    dependencies.push_back(main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType)));
+    dependencies.push_back(main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType)));
+    dependencies.push_back(main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType)));
     fpType* alpha_host_or_usm_ptr = &alpha;
     fpType* beta_host_or_usm_ptr = &beta;
     if (test_scalar_on_device) {
-        spmv_dependencies.push_back(
-            main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType)));
-        spmv_dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType)));
+        dependencies.push_back(main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType)));
+        dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType)));
         alpha_host_or_usm_ptr = alpha_usm_uptr.get();
         beta_host_or_usm_ptr = beta_usm_uptr.get();
@@ -133,24 +126,21 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
         sycl::event ev_opt;
         CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spmv_optimize, main_queue, transpose_val,
                       alpha_host_or_usm_ptr, A_view, A_handle, x_handle, beta_host_or_usm_ptr,
-                      y_handle, alg, descr, workspace_usm.get(), mat_dependencies);
+                      y_handle, alg, descr, workspace_usm.get(), dependencies);
-        spmv_dependencies.push_back(ev_opt);
         CALL_RT_OR_CT(ev_spmv = oneapi::mkl::sparse::spmv, main_queue, transpose_val,
                       alpha_host_or_usm_ptr, A_view, A_handle, x_handle, beta_host_or_usm_ptr,
-                      y_handle, alg, descr, spmv_dependencies);
+                      y_handle, alg, descr, { ev_opt });
         if (reset_data) {
             intType reset_nnz = generate_random_matrix<fpType, intType>(
                 format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host,
-            if (!is_sorted) {
-                shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(),
-                                      a_host.data(), reset_nnz, static_cast<std::size_t>(nrows_A));
-            }
+            shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(),
+                                            ja_host.data(), a_host.data(), reset_nnz,
+                                            static_cast<std::size_t>(nrows_A));
+            ev_spmv.wait_and_throw();
             if (reset_nnz > nnz) {
-                // Wait before freeing usm pointers
-                ev_spmv.wait_and_throw();
                 ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size());
                 ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size());
                 a_usm_uptr = malloc_device_uptr<fpType>(main_queue, a_host.size());
@@ -160,14 +150,14 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
             nnz = reset_nnz;
-            mat_dependencies.clear();
-            mat_dependencies.push_back(main_queue.memcpy(
-                ia_usm, ia_host.data(), ia_host.size() * sizeof(intType), ev_spmv));
-            mat_dependencies.push_back(main_queue.memcpy(
-                ja_usm, ja_host.data(), ja_host.size() * sizeof(intType), ev_spmv));
-            mat_dependencies.push_back(
+            dependencies.clear();
+            dependencies.push_back(main_queue.memcpy(ia_usm, ia_host.data(),
+                                                     ia_host.size() * sizeof(intType), ev_spmv));
+            dependencies.push_back(main_queue.memcpy(ja_usm, ja_host.data(),
+                                                     ja_host.size() * sizeof(intType), ev_spmv));
+            dependencies.push_back(
                 main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType), ev_spmv));
-            mat_dependencies.push_back(
+            dependencies.push_back(
                 main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType), ev_spmv));
             set_matrix_data(main_queue, format, A_handle, nrows_A, ncols_A, nnz, index, ia_usm,
                             ja_usm, a_usm);
@@ -182,7 +172,7 @@ int test_spmv(sycl::device* dev, sparse_matrix_format_t format, intType nrows_A,
             CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spmv_optimize, main_queue, transpose_val,
                           alpha_host_or_usm_ptr, A_view, A_handle, x_handle, beta_host_or_usm_ptr,
-                          y_handle, alg, descr, workspace_usm.get(), mat_dependencies);
+                          y_handle, alg, descr, workspace_usm.get(), dependencies);
             CALL_RT_OR_CT(ev_spmv = oneapi::mkl::sparse::spmv, main_queue, transpose_val,
                           alpha_host_or_usm_ptr, A_view, A_handle, x_handle, beta_host_or_usm_ptr,
diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp
index ca5689d13..19c237dc0 100644
--- a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp
+++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp
@@ -28,7 +28,8 @@ extern std::vector<sycl::device*> devices;
 namespace {
 template <typename fpType, typename intType>
-int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, double density_A_matrix,
+int test_spsv(sycl::device* dev, sycl::property_list queue_properties,
+              sparse_matrix_format_t format, intType m, double density_A_matrix,
               oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha,
               oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::matrix_view A_view,
               const std::set<oneapi::mkl::sparse::matrix_property>& matrix_properties,
@@ -37,16 +38,17 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl
         // Scalars on the device is not planned to be supported with the buffer API
         return 1;
-    sycl::queue main_queue(*dev, exception_handler_t());
+    sycl::queue main_queue(*dev, exception_handler_t(), queue_properties);
     intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
     const std::size_t mu = static_cast<std::size_t>(m);
-    const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) !=
-                           matrix_properties.cend();
     const bool is_symmetric =
         matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) !=
+    // Use a fixed seed for operations very sensitive to the input data
+    std::srand(1);
     // Input matrix
     std::vector<intType> ia_host, ja_host;
     std::vector<fpType> a_host;
@@ -69,10 +71,8 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl
     std::vector<fpType> y_ref_host(y_host);
     // Shuffle ordering of column indices/values to test sortedness
-    if (!is_sorted) {
-        shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz,
-                              mu);
-    }
+    shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(),
+                                    ja_host.data(), a_host.data(), nnz, mu);
     auto ia_buf = make_buffer(ia_host);
     auto ja_buf = make_buffer(ja_host);
@@ -109,10 +109,8 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl
             intType reset_nnz = generate_random_matrix<fpType, intType>(
                 format, m, m, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric,
-            if (!is_sorted) {
-                shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(),
-                                      a_host.data(), reset_nnz, mu);
-            }
+            shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(),
+                                            ja_host.data(), a_host.data(), reset_nnz, mu);
             if (reset_nnz > nnz) {
                 ia_buf = make_buffer(ia_host);
                 ja_buf = make_buffer(ja_host);
diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp
index 7a43a7112..68023591b 100644
--- a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp
+++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp
@@ -28,21 +28,23 @@ extern std::vector<sycl::device*> devices;
 namespace {
 template <typename fpType, typename intType>
-int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, double density_A_matrix,
+int test_spsv(sycl::device* dev, sycl::property_list queue_properties,
+              sparse_matrix_format_t format, intType m, double density_A_matrix,
               oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha,
               oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::matrix_view A_view,
               const std::set<oneapi::mkl::sparse::matrix_property>& matrix_properties,
               bool reset_data, bool test_scalar_on_device) {
-    sycl::queue main_queue(*dev, exception_handler_t());
+    sycl::queue main_queue(*dev, exception_handler_t(), queue_properties);
     intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
     const std::size_t mu = static_cast<std::size_t>(m);
-    const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) !=
-                           matrix_properties.cend();
     const bool is_symmetric =
         matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) !=
+    // Use a fixed seed for operations very sensitive to the input data
+    std::srand(1);
     // Input matrix
     std::vector<intType> ia_host, ja_host;
     std::vector<fpType> a_host;
@@ -65,10 +67,8 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl
     std::vector<fpType> y_ref_host(y_host);
     // Shuffle ordering of column indices/values to test sortedness
-    if (!is_sorted) {
-        shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz,
-                              mu);
-    }
+    shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(),
+                                    ja_host.data(), a_host.data(), nnz, mu);
     auto ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size());
     auto ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size());
@@ -83,24 +83,19 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl
     fpType* x_usm = x_usm_uptr.get();
     fpType* y_usm = y_usm_uptr.get();
-    std::vector<sycl::event> mat_dependencies;
-    std::vector<sycl::event> spsv_dependencies;
+    std::vector<sycl::event> dependencies;
     // Copy host to device
-    mat_dependencies.push_back(
+    dependencies.push_back(
         main_queue.memcpy(ia_usm, ia_host.data(), ia_host.size() * sizeof(intType)));
-    mat_dependencies.push_back(
+    dependencies.push_back(
         main_queue.memcpy(ja_usm, ja_host.data(), ja_host.size() * sizeof(intType)));
-    mat_dependencies.push_back(
-        main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType)));
-    spsv_dependencies.push_back(
-        main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType)));
-    spsv_dependencies.push_back(
-        main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType)));
+    dependencies.push_back(main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType)));
+    dependencies.push_back(main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType)));
+    dependencies.push_back(main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType)));
     fpType* alpha_host_or_usm_ptr = &alpha;
     if (test_scalar_on_device) {
-        spsv_dependencies.push_back(
-            main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType)));
+        dependencies.push_back(main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType)));
         alpha_host_or_usm_ptr = alpha_usm_uptr.get();
@@ -129,24 +124,20 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl
         sycl::event ev_opt;
         CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spsv_optimize, main_queue, transpose_val,
                       alpha_host_or_usm_ptr, A_view, A_handle, x_handle, y_handle, alg, descr,
-                      workspace_usm.get(), mat_dependencies);
+                      workspace_usm.get(), dependencies);
-        spsv_dependencies.push_back(ev_opt);
         CALL_RT_OR_CT(ev_spsv = oneapi::mkl::sparse::spsv, main_queue, transpose_val,
                       alpha_host_or_usm_ptr, A_view, A_handle, x_handle, y_handle, alg, descr,
-                      spsv_dependencies);
+                      { ev_opt });
         if (reset_data) {
             intType reset_nnz = generate_random_matrix<fpType, intType>(
                 format, m, m, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric,
-            if (!is_sorted) {
-                shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(),
-                                      a_host.data(), reset_nnz, mu);
-            }
+            shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(),
+                                            ja_host.data(), a_host.data(), reset_nnz, mu);
+            ev_spsv.wait_and_throw();
             if (reset_nnz > nnz) {
-                // Wait before freeing usm pointers
-                ev_spsv.wait_and_throw();
                 ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size());
                 ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size());
                 a_usm_uptr = malloc_device_uptr<fpType>(main_queue, a_host.size());
@@ -156,14 +147,14 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl
             nnz = reset_nnz;
-            mat_dependencies.clear();
-            mat_dependencies.push_back(main_queue.memcpy(
-                ia_usm, ia_host.data(), ia_host.size() * sizeof(intType), ev_spsv));
-            mat_dependencies.push_back(main_queue.memcpy(
-                ja_usm, ja_host.data(), ja_host.size() * sizeof(intType), ev_spsv));
-            mat_dependencies.push_back(
+            dependencies.clear();
+            dependencies.push_back(main_queue.memcpy(ia_usm, ia_host.data(),
+                                                     ia_host.size() * sizeof(intType), ev_spsv));
+            dependencies.push_back(main_queue.memcpy(ja_usm, ja_host.data(),
+                                                     ja_host.size() * sizeof(intType), ev_spsv));
+            dependencies.push_back(
                 main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType), ev_spsv));
-            mat_dependencies.push_back(
+            dependencies.push_back(
                 main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType), ev_spsv));
             set_matrix_data(main_queue, format, A_handle, m, m, nnz, index, ia_usm, ja_usm, a_usm);
@@ -177,7 +168,7 @@ int test_spsv(sycl::device* dev, sparse_matrix_format_t format, intType m, doubl
             CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spsv_optimize, main_queue, transpose_val,
                           alpha_host_or_usm_ptr, A_view, A_handle, x_handle, y_handle, alg, descr,
-                          workspace_usm.get(), mat_dependencies);
+                          workspace_usm.get(), dependencies);
             CALL_RT_OR_CT(ev_spsv = oneapi::mkl::sparse::spsv, main_queue, transpose_val,
                           alpha_host_or_usm_ptr, A_view, A_handle, x_handle, y_handle, alg, descr,

From bd5d47bd4be71fce665a5b97f40448cd5f6cc7a0 Mon Sep 17 00:00:00 2001
From: Romain Biessy <>
Date: Wed, 30 Oct 2024 18:21:53 +0100
Subject: [PATCH 2/2] [DFT] Reword comment (#607)

 src/dft/backends/mklgpu/commit.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dft/backends/mklgpu/commit.cpp b/src/dft/backends/mklgpu/commit.cpp
index bae1eb69f..8405c3891 100644
--- a/src/dft/backends/mklgpu/commit.cpp
+++ b/src/dft/backends/mklgpu/commit.cpp
@@ -43,7 +43,7 @@
 #include <oneapi/mkl/dft.hpp>
-// MKL 2024.1 deprecates input/output strides.
+// Intel oneMKL 2024.1 deprecates input/output strides.
 #if INTEL_MKL_VERSION < 20240001
 #error MKLGPU requires oneMKL 2024.1 or later