diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 04a1ba74b2..1646300e81 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -16,9 +16,11 @@ jobs: - name: Install Dependencies run: | brew install doxygen - python3 -m pip install sphinx -v "sphinx==6.2.1" - python3 -m pip install breathe - python3 -m pip install sphinx-rtd-theme + python3 -m venv .venv + . .venv/bin/activate + pip install sphinx -v "sphinx==6.2.1" + pip install breathe + pip install sphinx-rtd-theme sphinx-build --version doxygen --version @@ -52,8 +54,10 @@ jobs: working-directory: kokkos/build run: make -j2 install + # sphinx needs to be available at configure time for the target to be generated - name: configure_kokkos_kernels run: | + . .venv/bin/activate mkdir -p kokkos-kernels/{build,install} cd kokkos-kernels/build cmake \ @@ -81,5 +85,7 @@ jobs: fi - name: build_kokkos_kernels_sphinx - working-directory: kokkos-kernels/build - run: make Sphinx + run: | + . .venv/bin/activate + cd kokkos-kernels/build + make Sphinx diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 944807b032..9f05579fa5 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -58,7 +58,7 @@ jobs: uses: actions/checkout@v4 with: repository: kokkos/kokkos - ref: ${{ github.base_ref }} + ref: 4.3.00 path: kokkos - name: configure_kokkos diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bc9cb65a6..9cb40b5e74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Change Log +## [4.3.01](https://github.com/kokkos/kokkos-kernels/tree/4.3.01) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.3.00...4.3.01) + +### Bug Fixes: +- sparse: block spiluk fixes [\#2172](https://github.com/kokkos/kokkos-kernels/pull/2172) +- magma: tpl interaction fixes [\#2176](https://github.com/kokkos/kokkos-kernels/pull/2176), [\#2178](https://github.com/kokkos/kokkos-kernels/pull/2178), [\#2181](https://github.com/kokkos/kokkos-kernels/pull/2181) +- trsv: Add early return if numRows == 0 in trsv to avoid integer divide-by-zero error [\#2180](https://github.com/kokkos/kokkos-kernels/pull/2180) +- blas tpl: resolve potential duplicate symbol [\#2183](https://github.com/kokkos/kokkos-kernels/pull/2183) +- spmv: permformance fix, add back special path for rank-2 x/y with 1 column [\#2164](https://github.com/kokkos/kokkos-kernels/pull/2164), [\#2168](https://github.com/kokkos/kokkos-kernels/pull/2168) +- BsrMatrix: Fix HostMirror typedef [\#2196](https://github.com/kokkos/kokkos-kernels/pull/2196) +- GA: Fix macOS docs build [\#2190](https://github.com/kokkos/kokkos-kernels/pull/2190) + ## [4.3.00](https://github.com/kokkos/kokkos-kernels/tree/4.3.00) (2024-03-19) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.2.01...4.3.00) @@ -639,7 +651,7 @@ ## [3.6.00](https://github.com/kokkos/kokkos-kernels/tree/3.6.00) (2022-02-18) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.5.00...3.6.00) -### Features: +### Features: #### Batched Sparse Linear algebra - Kokkos Kernels is adding a new component to the library: batched sparse linear algebra. @@ -673,7 +685,7 @@ - SpMV: adding support for rocSPARSE TPL [\#1221](https://github.com/kokkos/kokkos-kernels/pull/1221) #### Additional new features -- bhalf: Unit test Batched GEMM [\#1251](https://github.com/kokkos/kokkos-kernels/pull/1251) +- bhalf: Unit test Batched GEMM [\#1251](https://github.com/kokkos/kokkos-kernels/pull/1251) - and demostrate GMRES example convergence with bhalf_t (https://github.com/kokkos/kokkos-kernels/pull/1300) - Stream interface: adding stream support in GEMV and GEMM [\#1131](https://github.com/kokkos/kokkos-kernels/pull/1131) - Improve double buffering batched gemm performance [\#1217](https://github.com/kokkos/kokkos-kernels/pull/1217) @@ -962,7 +974,7 @@ ## [3.1.01](https://github.com/kokkos/kokkos-kernels/tree/3.1.01) (2020-05-04) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.1.00...3.1.01) -** Fixed bugs:** +** Fixed bugs:** - KokkosBatched QR PR breaking nightly tests [\#691](https://github.com/kokkos/kokkos-kernels/issues/691) diff --git a/CMakeLists.txt b/CMakeLists.txt index bd3d761bdb..45e91a90f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) SET(KokkosKernels_VERSION_MINOR 3) -SET(KokkosKernels_VERSION_PATCH 0) +SET(KokkosKernels_VERSION_PATCH 1) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") #Set variables for config file @@ -127,13 +127,13 @@ ELSE() IF (NOT KOKKOSKERNELS_HAS_TRILINOS AND NOT KOKKOSKERNELS_HAS_PARENT) # This is a standalone build FIND_PACKAGE(Kokkos REQUIRED) - IF((${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.1.0") AND (${Kokkos_VERSION} VERSION_LESS_EQUAL "4.3.0")) + IF((${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.1.0") AND (${Kokkos_VERSION} VERSION_LESS_EQUAL "4.3.1")) MESSAGE(STATUS "Found Kokkos version ${Kokkos_VERSION} at ${Kokkos_DIR}") IF((${Kokkos_VERSION} VERSION_GREATER "4.3.99")) MESSAGE(WARNING "Configuring with Kokkos ${Kokkos_VERSION} which is newer than the expected develop branch - version check may need update") ENDIF() ELSE() - MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos_VERSION 4.1.0, 4.2.0, 4.2.1 or 4.3.0") + MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos_VERSION 4.1.0, 4.2.0, 4.2.1, 4.3.0, or 4.3.1") ENDIF() ENDIF() diff --git a/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp index a7430775ea..9f5f857e44 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp @@ -99,6 +99,48 @@ struct TeamTrsm +struct TeamTrsm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return TeamTrsmInternalLeftUpper::invoke( + member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), + B.stride_0()); + } +}; + +template +struct TeamTrsm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return TeamTrsmInternalLeftUpper::invoke( + member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, + A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_1(), + B.stride_0()); + } +}; + +/// +/// R/U/T +/// +/// B := (alpha*B) inv(triu(A)) +/// A(n x n), B(m x n) + template struct TeamTrsm { @@ -107,7 +149,7 @@ struct TeamTrsm::invoke( + return TeamTrsmInternalLeftUpper::invoke( member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0()); @@ -122,7 +164,7 @@ struct TeamTrsm::invoke( + return TeamTrsmInternalLeftUpper::invoke( member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0()); diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index d57e671908..3c00b4f477 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -229,7 +229,11 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, ASSERT_EQ(batchedGemmHandleCublas.vecLen, 0); #endif -#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) + // FIXME temporary workaround to run this magma test only if cublas is not + // enabled the design of the BatchedGemmHandle currently does not allow + // simultanous testing in this way. See issue #2177 +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) magma_queue_t magma_queue; BatchedGemmHandle batchedGemmHandleMagma(magma_queue, GemmTplAlgos::MAGMA, 0, 0); diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp index 8561675c72..f22e800bc5 100644 --- a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp @@ -163,7 +163,7 @@ namespace Impl { ETI_SPEC_AVAIL>::syr2(space, trans, uplo, alpha, X, Y, A); \ } else { \ if (A_is_ll) { \ - HostBlas>::zher2( \ + HostBlas>::her2( \ uplo[0], N, alpha, \ reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(Y.data()), one, \ @@ -220,7 +220,7 @@ namespace Impl { ETI_SPEC_AVAIL>::syr2(space, trans, uplo, alpha, X, Y, A); \ } else { \ if (A_is_ll) { \ - HostBlas>::cher2( \ + HostBlas>::her2( \ uplo[0], N, alpha, \ reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(Y.data()), one, \ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp index 6b64fce2bc..fc8fb949d7 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -139,7 +139,7 @@ namespace Impl { space, trans, uplo, alpha, X, A); \ } else { \ if (A_is_ll) { \ - HostBlas>::zher( \ + HostBlas>::her( \ uplo[0], N, alpha.real(), \ reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(A.data()), LDA); \ @@ -188,7 +188,7 @@ namespace Impl { space, trans, uplo, alpha, X, A); \ } else { \ if (A_is_ll && (alpha.imag() == 0.)) { \ - HostBlas>::cher( \ + HostBlas>::her( \ uplo[0], N, alpha.real(), \ reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(A.data()), LDA); \ diff --git a/blas/tpls/KokkosBlas_Cuda_tpl.cpp b/blas/tpls/KokkosBlas_Cuda_tpl.cpp index eed90ef7e0..cb8ba34101 100644 --- a/blas/tpls/KokkosBlas_Cuda_tpl.cpp +++ b/blas/tpls/KokkosBlas_Cuda_tpl.cpp @@ -16,3 +16,4 @@ #include #include #include +#include diff --git a/blas/tpls/KokkosBlas_Cuda_tpl.hpp b/blas/tpls/KokkosBlas_Cuda_tpl.hpp index cf51341471..d85785316e 100644 --- a/blas/tpls/KokkosBlas_Cuda_tpl.hpp +++ b/blas/tpls/KokkosBlas_Cuda_tpl.hpp @@ -39,26 +39,4 @@ CudaBlasSingleton& CudaBlasSingleton::singleton() { } // namespace KokkosBlas #endif // defined (KOKKOSKERNELS_ENABLE_TPL_CUBLAS) -#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) -#include - -namespace KokkosBlas { -namespace Impl { - -MagmaSingleton::MagmaSingleton() { - magma_int_t stat = magma_init(); - if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); - - Kokkos::push_finalize_hook([&]() { magma_finalize(); }); -} - -MagmaSingleton& MagmaSingleton::singleton() { - static MagmaSingleton s; - return s; -} - -} // namespace Impl -} // namespace KokkosBlas -#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) - #endif // KOKKOSBLAS_CUDA_TPL_HPP_ diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 50aab57c73..dc04ca7e67 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -295,10 +295,10 @@ void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, KK_INT*, const double*, void F77_BLAS_MANGLE(cher, CHER)(const char*, KK_INT*, const float*, const std::complex*, KK_INT*, - std::complex*, KK_INT*); + /* */ std::complex*, KK_INT*); void F77_BLAS_MANGLE(zher, ZHER)(const char*, KK_INT*, const double*, const std::complex*, KK_INT*, - std::complex*, KK_INT*); + /* */ std::complex*, KK_INT*); /// /// Syr2 @@ -322,12 +322,12 @@ void F77_BLAS_MANGLE(cher2, CHER2)(const char*, KK_INT*, const std::complex*, const std::complex*, KK_INT*, const std::complex*, KK_INT*, - std::complex*, KK_INT*); + /* */ std::complex*, KK_INT*); void F77_BLAS_MANGLE(zher2, ZHER2)(const char*, KK_INT*, const std::complex*, const std::complex*, KK_INT*, const std::complex*, KK_INT*, - std::complex*, KK_INT*); + /* */ std::complex*, KK_INT*); /// /// Trsv @@ -901,14 +901,14 @@ void HostBlas >::gerc( } template <> template <> -void HostBlas >::cher( +void HostBlas >::her( const char uplo, KK_INT n, const float alpha, const std::complex* x, KK_INT incx, std::complex* a, KK_INT lda) { F77_FUNC_CHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, (std::complex*)a, &lda); } template <> -void HostBlas >::cher2( +void HostBlas >::her2( const char uplo, KK_INT n, const std::complex alpha, const std::complex* x, KK_INT incx, const std::complex* y, KK_INT incy, std::complex* a, KK_INT lda) { @@ -1069,15 +1069,17 @@ void HostBlas >::gerc( } template <> template <> -void HostBlas >::zher( - const char uplo, KK_INT n, const double alpha, - const std::complex* x, KK_INT incx, std::complex* a, - KK_INT lda) { +void HostBlas >::her(const char uplo, KK_INT n, + const double alpha, + const std::complex* x, + KK_INT incx, + std::complex* a, + KK_INT lda) { F77_FUNC_ZHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, (std::complex*)a, &lda); } template <> -void HostBlas >::zher2( +void HostBlas >::her2( const char uplo, KK_INT n, const std::complex alpha, const std::complex* x, KK_INT incx, const std::complex* y, KK_INT incy, std::complex* a, KK_INT lda) { diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index 5fb7c1f624..d28f7a2186 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -90,18 +90,11 @@ struct HostBlas { KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); template - static void cher(const char uplo, KK_INT n, const tAlpha alpha, const T *x, - KK_INT incx, T *a, KK_INT lda); - - template - static void zher(const char uplo, KK_INT n, const tAlpha alpha, const T *x, - KK_INT incx, T *a, KK_INT lda); - - static void cher2(const char uplo, KK_INT n, const T alpha, const T *x, - KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); + static void her(const char uplo, KK_INT n, const tAlpha alpha, const T *x, + KK_INT incx, T *a, KK_INT lda); - static void zher2(const char uplo, KK_INT n, const T alpha, const T *x, - KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); + static void her2(const char uplo, KK_INT n, const T alpha, const T *x, + KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); static void trsv(const char uplo, const char transa, const char diag, KK_INT m, const T *a, KK_INT lda, diff --git a/blas/tpls/KokkosBlas_Magma_tpl.hpp b/blas/tpls/KokkosBlas_Magma_tpl.hpp new file mode 100644 index 0000000000..f149a790df --- /dev/null +++ b/blas/tpls/KokkosBlas_Magma_tpl.hpp @@ -0,0 +1,41 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBLAS_MAGMA_TPL_HPP_ +#define KOKKOSBLAS_MAGMA_TPL_HPP_ + +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) +#include + +namespace KokkosBlas { +namespace Impl { + +MagmaSingleton::MagmaSingleton() { + magma_int_t stat = magma_init(); + if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); + + Kokkos::push_finalize_hook([&]() { magma_finalize(); }); +} + +MagmaSingleton& MagmaSingleton::singleton() { + static MagmaSingleton s; + return s; +} + +} // namespace Impl +} // namespace KokkosBlas +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) + +#endif // KOKKOSBLAS_MAGMA_TPL_HPP_ diff --git a/blas/tpls/KokkosBlas_magma.hpp b/blas/tpls/KokkosBlas_magma.hpp new file mode 100644 index 0000000000..5f5fcfe4e1 --- /dev/null +++ b/blas/tpls/KokkosBlas_magma.hpp @@ -0,0 +1,37 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS_MAGMA_HPP_ +#define KOKKOSBLAS_MAGMA_HPP_ + +// If LAPACK TPL is enabled, it is preferred over magma's LAPACK +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "magma_v2.h" + +namespace KokkosBlas { +namespace Impl { + +struct MagmaSingleton { + MagmaSingleton(); + + static MagmaSingleton& singleton(); +}; + +} // namespace Impl +} // namespace KokkosBlas +#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA + +#endif // KOKKOSBLAS_MAGMA_HPP_ diff --git a/blas/tpls/KokkosBlas_tpl_spec.hpp b/blas/tpls/KokkosBlas_tpl_spec.hpp index a1eee4b69c..0151c0534f 100644 --- a/blas/tpls/KokkosBlas_tpl_spec.hpp +++ b/blas/tpls/KokkosBlas_tpl_spec.hpp @@ -214,21 +214,4 @@ inline rocblas_operation trans_mode_kk_to_rocblas(const char kkMode[]) { #endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -// If LAPACK TPL is enabled, it is preferred over magma's LAPACK -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#include "magma_v2.h" - -namespace KokkosBlas { -namespace Impl { - -struct MagmaSingleton { - MagmaSingleton(); - - static MagmaSingleton& singleton(); -}; - -} // namespace Impl -} // namespace KokkosBlas -#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA - #endif // KOKKOSBLAS_TPL_SPEC_HPP_ diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index f825a2184a..804a2b7542 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -34,6 +34,12 @@ IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) ) ENDIF() +IF (KOKKOSKERNELS_ENABLE_TPL_MAGMA) + LIST(APPEND SOURCES + lapack/tpls/KokkosLapack_Magma_tpl.cpp + ) +ENDIF() + ################## # # # ETI generation # diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp index 6749a4740f..943d10d111 100644 --- a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp @@ -39,26 +39,4 @@ CudaLapackSingleton& CudaLapackSingleton::singleton() { } // namespace KokkosLapack #endif // defined (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) -#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) -#include - -namespace KokkosLapack { -namespace Impl { - -MagmaSingleton::MagmaSingleton() { - magma_int_t stat = magma_init(); - if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); - - Kokkos::push_finalize_hook([&]() { magma_finalize(); }); -} - -MagmaSingleton& MagmaSingleton::singleton() { - static MagmaSingleton s; - return s; -} - -} // namespace Impl -} // namespace KokkosLapack -#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) - #endif // KOKKOSLAPACK_CUDA_TPL_HPP_ diff --git a/lapack/tpls/KokkosLapack_Magma_tpl.cpp b/lapack/tpls/KokkosLapack_Magma_tpl.cpp new file mode 100644 index 0000000000..73add8d9e0 --- /dev/null +++ b/lapack/tpls/KokkosLapack_Magma_tpl.cpp @@ -0,0 +1,18 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#include +#include +#include diff --git a/lapack/tpls/KokkosLapack_Magma_tpl.hpp b/lapack/tpls/KokkosLapack_Magma_tpl.hpp new file mode 100644 index 0000000000..636c40735d --- /dev/null +++ b/lapack/tpls/KokkosLapack_Magma_tpl.hpp @@ -0,0 +1,41 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSLAPACK_MAGMA_TPL_HPP_ +#define KOKKOSLAPACK_MAGMA_TPL_HPP_ + +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) +#include + +namespace KokkosLapack { +namespace Impl { + +MagmaSingleton::MagmaSingleton() { + magma_int_t stat = magma_init(); + if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); + + Kokkos::push_finalize_hook([&]() { magma_finalize(); }); +} + +MagmaSingleton& MagmaSingleton::singleton() { + static MagmaSingleton s; + return s; +} + +} // namespace Impl +} // namespace KokkosLapack +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) + +#endif // KOKKOSLAPACK_MAGMA_TPL_HPP_ diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 41592e079a..ca4b9e7abc 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -155,7 +155,7 @@ KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#include +#include namespace KokkosLapack { namespace Impl { diff --git a/master_history.txt b/master_history.txt index 2207bca133..3e8f8fcbd8 100644 --- a/master_history.txt +++ b/master_history.txt @@ -25,3 +25,4 @@ tag: 4.1.00 date: 06/20/2023 master: 1331baf1 release: 14ad220a tag: 4.2.00 date: 11/09/2023 master: 25a31f88 release: 912d3778 tag: 4.2.01 date: 01/30/2024 master: f429f6ec release: bcf9854b tag: 4.3.00 date: 04/03/2024 master: afd65f03 release: ebbf4b78 +tag: 4.3.01 date: 05/07/2024 master: 1b0a15f5 release: 58785c1b diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index b3b5dfa277..415ccf87a0 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -32,6 +32,9 @@ #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" #include "KokkosBlas1_set.hpp" +#include "KokkosBatched_LU_Decl.hpp" +#include "KokkosBatched_Trmm_Decl.hpp" +#include "KokkosBatched_Trmm_Serial_Impl.hpp" //#define NUMERIC_OUTPUT_INFO @@ -107,6 +110,17 @@ struct IlukWrap { lno_t lev_start; using reftype = scalar_t &; + using valtype = scalar_t; + + static constexpr size_type BUFF_SIZE = 1; + + struct SBlock { + template + KOKKOS_INLINE_FUNCTION SBlock(T, size_type, size_type) {} + + KOKKOS_INLINE_FUNCTION + scalar_t *data() { return nullptr; } + }; Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, const LRowMapType &L_row_map_, @@ -131,6 +145,9 @@ struct IlukWrap { "Tried to use blocks with the unblocked Common?"); } + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return 0; } + // lset KOKKOS_INLINE_FUNCTION void lset(const size_type nnz, const scalar_t &value) const { @@ -154,12 +171,18 @@ struct IlukWrap { // divide. lhs /= rhs KOKKOS_INLINE_FUNCTION - void divide(const member_type &team, scalar_t &lhs, - const scalar_t &rhs) const { + void divide(const member_type &team, scalar_t &lhs, const scalar_t &rhs, + scalar_t *) const { Kokkos::single(Kokkos::PerTeam(team), [&]() { lhs /= rhs; }); team.team_barrier(); } + // divide_left. lhs /= rhs + KOKKOS_INLINE_FUNCTION + void divide_left(scalar_t &lhs, const scalar_t &rhs, scalar_t *) const { + lhs /= rhs; + } + // multiply_subtract. C -= A * B KOKKOS_INLINE_FUNCTION void multiply_subtract(const scalar_t &A, const scalar_t &B, @@ -171,6 +194,18 @@ struct IlukWrap { KOKKOS_INLINE_FUNCTION scalar_t &lget(const size_type nnz) const { return L_values(nnz); } + // lcopy + KOKKOS_INLINE_FUNCTION + scalar_t lcopy(const size_type nnz, scalar_t *) const { + return L_values(nnz); + } + + // ucopy + KOKKOS_INLINE_FUNCTION + scalar_t ucopy(const size_type nnz, scalar_t *) const { + return U_values(nnz); + } + // uget KOKKOS_INLINE_FUNCTION scalar_t &uget(const size_type nnz) const { return U_values(nnz); } @@ -188,6 +223,12 @@ struct IlukWrap { // print KOKKOS_INLINE_FUNCTION void print(const scalar_t &item) const { std::cout << item << std::endl; } + + // report + KOKKOS_INLINE_FUNCTION + void report() const { + std::cout << "JGF using unblocked version" << std::endl; + } }; // Partial specialization for block support @@ -197,6 +238,30 @@ struct IlukWrap { struct Common { + // BSR data is in LayoutRight! + using Layout = Kokkos::LayoutRight; + using value_type = typename LValuesType::value_type; + using cvalue_type = typename LValuesType::const_value_type; + + using Block = Kokkos::View< + value_type **, Layout, typename LValuesType::device_type, + Kokkos::MemoryTraits >; + + // const block + using CBlock = Kokkos::View< + cvalue_type **, Layout, typename UValuesType::device_type, + Kokkos::MemoryTraits >; + + // scratch block + using SBlock = Kokkos::View< + value_type **, Layout, typename execution_space::scratch_memory_space, + Kokkos::MemoryTraits >; + + using reftype = Block; + using valtype = Block; + + static constexpr size_type BUFF_SIZE = 128; + ARowMapType A_row_map; AEntriesType A_entries; AValuesType A_values; @@ -212,26 +277,6 @@ struct IlukWrap { size_type block_size; size_type block_items; - // BSR data is in LayoutRight! - using Layout = Kokkos::LayoutRight; - - using LBlock = Kokkos::View< - typename LValuesType::value_type **, Layout, - typename LValuesType::device_type, - Kokkos::MemoryTraits >; - - using UBlock = Kokkos::View< - typename UValuesType::value_type **, Layout, - typename UValuesType::device_type, - Kokkos::MemoryTraits >; - - using ABlock = Kokkos::View< - typename AValuesType::value_type **, Layout, - typename AValuesType::device_type, - Kokkos::MemoryTraits >; - - using reftype = LBlock; - Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, const LRowMapType &L_row_map_, const LEntriesType &L_entries_, LValuesType &L_values_, @@ -255,8 +300,12 @@ struct IlukWrap { block_items(block_size * block_size) { KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?"); + KK_REQUIRE_MSG(block_size <= 11, "Max supported block size is 11"); } + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return block_size; } + // lset KOKKOS_INLINE_FUNCTION void lset(const size_type block, const scalar_t &value) const { @@ -264,13 +313,9 @@ struct IlukWrap { } KOKKOS_INLINE_FUNCTION - void lset(const size_type block, const ABlock &rhs) const { + void lset(const size_type block, const CBlock &rhs) const { auto lblock = lget(block); - for (size_type i = 0; i < block_size; ++i) { - for (size_type j = 0; j < block_size; ++j) { - lblock(i, j) = rhs(i, j); - } - } + assign(lblock, rhs); } // uset @@ -280,13 +325,9 @@ struct IlukWrap { } KOKKOS_INLINE_FUNCTION - void uset(const size_type block, const ABlock &rhs) const { + void uset(const size_type block, const CBlock &rhs) const { auto ublock = uget(block); - for (size_type i = 0; i < block_size; ++i) { - for (size_type j = 0; j < block_size; ++j) { - ublock(i, j) = rhs(i, j); - } - } + assign(ublock, rhs); } // lset_id @@ -295,49 +336,111 @@ struct IlukWrap { KokkosBatched::TeamSetIdentity::invoke(team, lget(block)); } - // divide. lhs /= rhs + // assign + template + KOKKOS_INLINE_FUNCTION void assign(const ViewT &lhs, + const CBlock &rhs) const { + for (size_type i = 0; i < block_size; ++i) { + for (size_type j = 0; j < block_size; ++j) { + lhs(i, j) = rhs(i, j); + } + } + } + + // divide. lhs /= rhs (lhs = lhs * rhs^-1) KOKKOS_INLINE_FUNCTION - void divide(const member_type &team, const LBlock &lhs, - const UBlock &rhs) const { + void divide(const member_type &team, const Block &lhs, const CBlock &rhs, + scalar_t *buff) const { + // Need a temp block to do LU of rhs + Block LU(buff, block_size, block_size); + assign(LU, rhs); + KokkosBatched::TeamLU::invoke(team, LU); + + // rhs = LU + // rhs^-1 = U^-1 * L^-1 + // lhs = (lhs * U^-1) * L^-1, so do U trsm first KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, - KokkosBatched::Trans::NoTranspose, // not 100% on this - KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>:: // not 100% on this - invoke(team, 1.0, rhs, lhs); + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(team, 1.0, LU, lhs); + + KokkosBatched::TeamTrsm< + member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Lower, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(team, 1.0, LU, lhs); + } + + // divide_left. lhs /= rhs (lhs = rhs^-1 * lhs) + KOKKOS_INLINE_FUNCTION + void divide_left(const Block &lhs, const CBlock &rhs, + scalar_t *buff) const { + Block LU(buff, block_size, block_size); + assign(LU, rhs); + KokkosBatched::SerialLU::invoke(LU); + + // rhs = LU + // rhs^-1 = U^-1 * L^-1 + // lhs = U^-1 * (L^-1 * lhs), so do L trsm first + KokkosBatched::SerialTrsm< + KokkosBatched::Side::Left, KokkosBatched::Uplo::Lower, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(1.0, LU, lhs); + + KokkosBatched::SerialTrsm< + KokkosBatched::Side::Left, KokkosBatched::Uplo::Upper, + KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Blocked>::invoke(1.0, LU, lhs); } // multiply_subtract. C -= A * B - template - KOKKOS_INLINE_FUNCTION void multiply_subtract(const UBlock &A, - const LBlock &B, - CView &C) const { + KOKKOS_INLINE_FUNCTION + void multiply_subtract(const CBlock &A, const CBlock &B, + const Block &C) const { // Use gemm. alpha is hardcoded to -1, beta hardcoded to 1 KokkosBatched::SerialGemm< KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemm::Unblocked>::invoke( - -1.0, A, B, 1.0, C); + KokkosBatched::Algo::Gemm::Blocked>::invoke(-1.0, A, B, 1.0, + C); } // lget KOKKOS_INLINE_FUNCTION - LBlock lget(const size_type block) const { - return LBlock(L_values.data() + (block * block_items), block_size, - block_size); + Block lget(const size_type block) const { + return Block(L_values.data() + (block * block_items), block_size, + block_size); + } + + // lcopy + KOKKOS_INLINE_FUNCTION + Block lcopy(const size_type block, scalar_t *buff) const { + Block result(buff, block_size, block_size); + auto lblock = lget(block); + assign(result, lblock); + return result; + } + + // ucopy + KOKKOS_INLINE_FUNCTION + Block ucopy(const size_type block, scalar_t *buff) const { + Block result(buff, block_size, block_size); + auto ublock = uget(block); + assign(result, ublock); + return result; } // uget KOKKOS_INLINE_FUNCTION - UBlock uget(const size_type block) const { - return UBlock(U_values.data() + (block * block_items), block_size, - block_size); + Block uget(const size_type block) const { + return Block(U_values.data() + (block * block_items), block_size, + block_size); } // aget KOKKOS_INLINE_FUNCTION - ABlock aget(const size_type block) const { - return ABlock(A_values.data() + (block * block_items), block_size, + CBlock aget(const size_type block) const { + return CBlock(A_values.data() + (block * block_items), block_size, block_size); } @@ -357,7 +460,7 @@ struct IlukWrap { // print KOKKOS_INLINE_FUNCTION - void print(const LBlock &item) const { + void print(const CBlock &item) const { for (size_type i = 0; i < block_size; ++i) { std::cout << " "; for (size_type j = 0; j < block_size; ++j) { @@ -366,6 +469,13 @@ struct IlukWrap { std::cout << std::endl; } } + + // report + KOKKOS_INLINE_FUNCTION + void report() const { + std::cout << "JGF using blocked version with block_size=" << block_size + << std::endl; + } }; template + typedef BsrMatrix HostMirror; //! Type of the graph structure of the sparse matrix. typedef Kokkos::StaticCrsGraph +inline constexpr bool spmv_general_tpl_avail() { + constexpr bool isBSR = ::KokkosSparse::Experimental::is_bsr_matrix_v; + if constexpr (!isBSR) { + // CRS + if constexpr (XVector::rank() == 1) + return spmv_tpl_spec_avail::value; + else + return spmv_mv_tpl_spec_avail::value; + } else { + // BSR + if constexpr (XVector::rank() == 1) + return spmv_bsrmatrix_tpl_spec_avail::value; + else + return spmv_mv_bsrmatrix_tpl_spec_avail::value; + } +} +} // namespace Impl + // clang-format off /// \brief Kokkos sparse matrix-vector multiply. /// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is @@ -221,6 +246,35 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename YVector::device_type, Kokkos::MemoryTraits>; + // Special case: XVector/YVector are rank-2 but x,y both have one column and + // are contiguous. If a TPL is available for rank-1 vectors but not rank-2, + // take rank-1 subviews of x,y and call the rank-1 version. + if constexpr (XVector::rank() == 2) { + using XVector_SubInternal = Kokkos::View< + typename XVector::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XVector::device_type, + Kokkos::MemoryTraits>; + using YVector_SubInternal = Kokkos::View< + typename YVector::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits>; + if constexpr (!Impl::spmv_general_tpl_avail< + ExecutionSpace, HandleImpl, AMatrix_Internal, + XVector_Internal, YVector_Internal>() && + Impl::spmv_general_tpl_avail< + ExecutionSpace, HandleImpl, AMatrix_Internal, + XVector_SubInternal, YVector_SubInternal>()) { + if (x.extent(1) == size_t(1) && x.span_is_contiguous() && + y.span_is_contiguous()) { + XVector_SubInternal xsub(x.data(), x.extent(0)); + YVector_SubInternal ysub(y.data(), y.extent(0)); + spmv(space, handle->get_impl(), mode, alpha, A, xsub, beta, ysub); + return; + } + } + } + XVector_Internal x_i(x); YVector_Internal y_i(y); diff --git a/sparse/src/KokkosSparse_spmv_handle.hpp b/sparse/src/KokkosSparse_spmv_handle.hpp index 9e7295c72c..a2eecfd1ce 100644 --- a/sparse/src/KokkosSparse_spmv_handle.hpp +++ b/sparse/src/KokkosSparse_spmv_handle.hpp @@ -237,9 +237,8 @@ struct SPMVHandleImpl { ~SPMVHandleImpl() { if (tpl) delete tpl; } - void set_exec_space(const ExecutionSpace& exec) { - if (tpl) tpl->set_exec_space(exec); - } + + ImplType* get_impl() { return this; } /// Get the SPMVAlgorithm used by this handle SPMVAlgorithm get_algorithm() const { return this->algo; } diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 9c844ff910..e867038842 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -48,6 +48,8 @@ inline void spmv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for MKL BSR"); + // note: classic mkl only runs on synchronous host exec spaces, so no need + // to call set_exec_space on the subhandle here } else { // Use the default execution space instance, as classic MKL does not use // a specific instance. @@ -127,6 +129,8 @@ inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for MKL BSR"); + // note: classic mkl only runs on synchronous host exec spaces, so no need + // to call set_exec_space on the subhandle here } else { // Use the default execution space instance, as classic MKL does not use // a specific instance. @@ -392,6 +396,7 @@ void spmv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); + subhandle->set_exec_space(exec); } else { /* create and set the subhandle and matrix descriptor */ subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); @@ -519,6 +524,7 @@ void spmv_mv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); + subhandle->set_exec_space(exec); } else { /* create and set the subhandle and matrix descriptor */ subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); @@ -886,6 +892,7 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for rocsparse BSR"); + subhandle->set_exec_space(exec); } else { subhandle = new KokkosSparse::Impl::RocSparse_BSR_SpMV_Data(exec); handle->tpl = subhandle; diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp index 88fef4421a..44a8098ca3 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp @@ -29,6 +29,7 @@ struct spmv_mv_tpl_spec_avail { enum : bool { value = false }; }; +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #define KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(SCALAR, ORDINAL, OFFSET, \ XL, YL, MEMSPACE) \ template <> \ @@ -152,6 +153,7 @@ KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int, #endif #endif // defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) +#endif } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 47b7d47f8e..2ae6bf44f2 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -192,6 +192,7 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); + subhandle->set_exec_space(exec); } else { subhandle = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec); handle->tpl = subhandle; diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 926d201a52..e3f88e6e11 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -111,6 +111,7 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); + subhandle->set_exec_space(exec); } else { subhandle = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec); handle->tpl = subhandle; @@ -155,6 +156,7 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for cusparse"); + subhandle->set_exec_space(exec); } else { /* create and set the subhandle and matrix descriptor */ subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); @@ -423,6 +425,7 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for rocsparse CRS"); + subhandle->set_exec_space(exec); } else { subhandle = new KokkosSparse::Impl::RocSparse_CRS_SpMV_Data(exec); handle->tpl = subhandle; @@ -594,6 +597,8 @@ inline void spmv_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for MKL CRS"); + // note: classic mkl only runs on synchronous host exec spaces, so no need + // to call set_exec_space on the subhandle here } else { // Use the default execution space instance, as classic MKL does not use // a specific instance. @@ -757,6 +762,7 @@ inline void spmv_onemkl(const execution_space& exec, Handle* handle, if (!subhandle) throw std::runtime_error( "KokkosSparse::spmv: subhandle is not set up for OneMKL CRS"); + subhandle->set_exec_space(exec); } else { subhandle = new OneMKL_SpMV_Data(exec); handle->tpl = subhandle; diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 2a8398ed46..9eaf087c9b 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -121,8 +121,6 @@ MatrixType make_matrix(const char* name, const RowMapType& row_map, entries, block_size); } -static constexpr double EPS = 1e-7; - template struct SpilukTest { @@ -130,6 +128,7 @@ struct SpilukTest { using EntriesType = Kokkos::View; using ValuesType = Kokkos::View; using AT = Kokkos::ArithTraits; + using mag_t = typename Kokkos::ArithTraits::mag_type; using RowMapType_hostmirror = typename RowMapType::HostMirror; using EntriesType_hostmirror = typename EntriesType::HostMirror; @@ -138,6 +137,9 @@ struct SpilukTest { using memory_space = typename device::memory_space; using range_policy = Kokkos::RangePolicy; + static constexpr double EPS = + std::is_same::value ? 1e-7 : 1e-4; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; @@ -243,11 +245,7 @@ struct SpilukTest { } if (fill_lev > 1) { - if (UseBlocks) { - EXPECT_LT(result, 1e-2); - } else { - EXPECT_LT(result, 1e-4); - } + EXPECT_LT(result, 1e-4); } }