diff --git a/.gitignore b/.gitignore index fa032cb2cb..d64726e92e 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,8 @@ .project *.o TAGS + +#Clangd indexing +compile_commands.json +.cache/ +.vscode/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 76b4117118..ac3b708fb2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,31 @@ # Change Log -## [4.0.0](https://github.com/kokkos/kokkos-kernels/tree/4.0.0) (2023-21-02) -[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.01...4.0.0) +## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-04-19) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.00...4.0.01) + +### Bug Fixes: +- Use the options ENABLE_PERFTEST, ENABLE_EXAMPLES [\#1667](https://github.com/kokkos/kokkos-kernels/pull/1667) +- Introduce KOKKOSKERNELS_ALL_COMPONENTS_ENABLED variable [\#1691](https://github.com/kokkos/kokkos-kernels/pull/1691) +- Kokkos Kernels version: need to use upper case variables [\#1707](https://github.com/kokkos/kokkos-kernels/pull/1707) +- CUSPARSE_MM_ALG_DEFAULT deprecated by cuSparse 11.1 [\#1698](https://github.com/kokkos/kokkos-kernels/pull/1698) +- blas1: Fix a couple documentation typos [\#1704](https://github.com/kokkos/kokkos-kernels/pull/1704) +- CUDA 11.4: fixing some -Werror [\#1727](https://github.com/kokkos/kokkos-kernels/pull/1727) +- Remove unused variable in KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp [\#1734](https://github.com/kokkos/kokkos-kernels/pull/1734) +- Reduce BatchedGemm test coverage time [\#1737](https://github.com/kokkos/kokkos-kernels/pull/1737) +- Fix kk_generate_diagonally_dominant_sparse_matrix hang [\#1689](https://github.com/kokkos/kokkos-kernels/pull/1689) +- Temporary spgemm workaround matching Trilinos 11663 [\#1757](https://github.com/kokkos/kokkos-kernels/pull/1757) +- MDF: Minor changes to interface for ifpack2 impl [\#1759](https://github.com/kokkos/kokkos-kernels/pull/1759) +- Rocm TPL support upgrade [\#1763](https://github.com/kokkos/kokkos-kernels/pull/1763) +- Fix BLAS cmake check for complex types [\#1762](https://github.com/kokkos/kokkos-kernels/pull/1762) +- ParIlut: Adds a better parilut test with gmres [\#1661](https://github.com/kokkos/kokkos-kernels/pull/1661) +- GMRES: fixing some type issues related to memory space instantiation (partial) [\#1719](https://github.com/kokkos/kokkos-kernels/pull/1719) +- ParIlut: create and destroy spgemm handle for each usage [\#1736](https://github.com/kokkos/kokkos-kernels/pull/1736) +- ParIlut: remove par ilut limitations [\#1755](https://github.com/kokkos/kokkos-kernels/pull/1755) +- ParIlut: make Ut_values view atomic in compute_l_u_factors [\#1781](https://github.com/kokkos/kokkos-kernels/pull/1781) + + +## [4.0.0](https://github.com/kokkos/kokkos-kernels/tree/4.0.00) (2023-21-02) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.01...4.0.00) ### Features: - Copyright update 4.0 [\#1657](https://github.com/kokkos/kokkos-kernels/pull/1657) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8166d846fe..19c66c0f64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) SET(KokkosKernels_VERSION_MINOR 0) -SET(KokkosKernels_VERSION_PATCH 0) +SET(KokkosKernels_VERSION_PATCH 1) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") #Set variables for config file @@ -70,7 +70,7 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) ) KOKKOSKERNELS_ADD_OPTION( "ENABLE_PERFTESTS" - ON + OFF BOOL "Whether to build performance tests. Default: OFF" ) @@ -221,7 +221,7 @@ ELSE() # ================================================================== MESSAGE("") MESSAGE("================================") - MESSAGE("Kokkos Kernels version: ${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") + MESSAGE("Kokkos Kernels version: ${KOKKOSKERNELS_VERSION_MAJOR}.${KOKKOSKERNELS_VERSION_MINOR}.${KOKKOSKERNELS_VERSION_PATCH}") MESSAGE("================================") MESSAGE("Kokkos Kernels ETI Types") MESSAGE(" Devices: ${DEVICE_LIST}") @@ -390,10 +390,25 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_SPARSE) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) ENDIF() - IF (KokkosKernels_ENABLE_ALL_COMPONENTS) - KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) - KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) - ENDIF() + IF (KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) + IF (KokkosKernels_ENABLE_PERFTESTS) + MESSAGE(STATUS "Enabling perf tests.") + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) + ENDIF () + IF (KokkosKernels_ENABLE_EXAMPLES) + MESSAGE(STATUS "Enabling examples.") + KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) + ENDIF () + ELSE () + # all components were not enabled, so perftests and examples can't be enabled. + # Warn if they were requested. + IF (KokkosKernels_ENABLE_PERFTESTS) + MESSAGE(WARNING "Could not enable perf tests because not all components were enabled") + ENDIF () + IF (KokkosKernels_ENABLE_EXAMPLES) + MESSAGE(WARNING "Could not enable examples because not all components were enabled") + ENDIF () + ENDIF () KOKKOSKERNELS_PACKAGE_POSTPROCESS() IF (KokkosKernels_ENABLE_DOCS) diff --git a/CheckHostBlasReturnComplex.cmake b/CheckHostBlasReturnComplex.cmake index 30063b1cc3..b9528ce45a 100644 --- a/CheckHostBlasReturnComplex.cmake +++ b/CheckHostBlasReturnComplex.cmake @@ -20,8 +20,8 @@ FUNCTION(CHECK_HOST_BLAS_RETURN_COMPLEX VARNAME) #define F77_BLAS_MANGLE${F77_BLAS_MANGLE} extern \"C\" { - std::complex F77_BLAS_MANGLE(zdotc,ZDOTC)( - const int* n, + void F77_BLAS_MANGLE(zdotc,ZDOTC)( + std::complex* result, const int* n, const std::complex x[], const int* incx, const std::complex y[], const int* incy); } @@ -35,13 +35,23 @@ int main() { TWO = std::complex(0.0,2.0); f[0] = ONE; f[1] = TWO; - std::complex ret - = F77_BLAS_MANGLE(zdotc,ZDOTC)(&NUM, f, &INC, f, &INC); + std::complex ret; + F77_BLAS_MANGLE(zdotc,ZDOTC)(&ret, &NUM, f, &INC, f, &INC); return (ret.real() == double(5.0) ? 0 : 1); } " ) - CHECK_CXX_SOURCE_RUNS("${SOURCE}" ${VARNAME}) +# Test whether the above program, which assumes BLAS can give back complex results +# via pointer arguments, compiles and runs correctly. +# If it does, assume that we don't need to get complex results as direct return values, +# which causes -Wreturn-type-c-linkage warnings. +CHECK_CXX_SOURCE_RUNS("${SOURCE}" KK_BLAS_RESULT_AS_POINTER_ARG) + +IF(${KK_BLAS_RESULT_AS_POINTER_ARG}) + SET(VARNAME OFF) +ELSE() + SET(VARNAME ON) +ENDIF() ENDFUNCTION() diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index 746be07eb7..c4e09d6e68 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -257,11 +257,15 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, ASSERT_EQ(batchedGemmHandle.get_kernel_algo_type(), algo_type); - if (algo_type == BaseHeuristicAlgos::SQUARE || - algo_type == BaseTplAlgos::ARMPL || + if (algo_type == BaseTplAlgos::ARMPL || algo_type == BaseKokkosBatchedAlgos::KK_SERIAL || algo_type == GemmKokkosBatchedAlgos::KK_SERIAL_RANK0 || algo_type == GemmKokkosBatchedAlgos::KK_DBLBUF) { + impl_test_batched_gemm_with_handle( + &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, + matCdim1, matCdim2, 1.5, 3.0); + } else if (algo_type == BaseHeuristicAlgos::SQUARE) { // Invoke 4 times to ensure we cover all paths for alpha and beta impl_test_batched_gemm_with_handle( @@ -316,13 +320,12 @@ template void test_batched_gemm_with_layout(int N) { // Square cases - for (int i = 0; i < 5; ++i) { + { + int i = 0; Test::impl_test_batched_gemm(N, i, i, i, i, i, i); - } - { - int i = 10; + i = 10; Test::impl_test_batched_gemm(N, i, i, i, i, i, i); @@ -336,7 +339,7 @@ void test_batched_gemm_with_layout(int N) { } // Non-square cases - for (int i = 0; i < 5; ++i) { + for (int i = 1; i < 5; ++i) { int dimM = 1 * i; int dimN = 2 * i; int dimK = 3 * i; diff --git a/blas/src/KokkosBlas1_mult.hpp b/blas/src/KokkosBlas1_mult.hpp index 39ccbbeebd..e08409e9aa 100644 --- a/blas/src/KokkosBlas1_mult.hpp +++ b/blas/src/KokkosBlas1_mult.hpp @@ -23,6 +23,20 @@ namespace KokkosBlas { +/// \brief Element wise multiplication of two vectors: +/// Y[i] = gamma * Y[i] + alpha * A[i] * X[i] +/// +/// \tparam YMV Type of the first vector Y; a 1-D or 2-D Kokkos::View. +/// \tparam AV Type of the second vector A; a 1-D Kokkos::View. +/// \tparam XMV Type of the third vector X; a 1-D or 2-D Kokkos::View. +/// +/// \param gamma [in] The scalar to apply to Y. +/// \param Y [in/out] The Y vector. +/// \param alpha [in] The scalar to apply to A. +/// \param A [in] The vector to apply to X. +/// \param X [in] The X vector. +/// +/// \return Y = gamma * Y + alpha * A * X. template void mult(typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, const XMV& X) { diff --git a/blas/src/KokkosBlas1_nrm2w.hpp b/blas/src/KokkosBlas1_nrm2w.hpp index 5a7a07001c..403d8ba685 100644 --- a/blas/src/KokkosBlas1_nrm2w.hpp +++ b/blas/src/KokkosBlas1_nrm2w.hpp @@ -65,7 +65,7 @@ nrm2w(const XVector& x, const XVector& w) { /// \brief R(i,j) = nrm2w(X(i,j)) /// -/// Replace each entry in R with the nrm2wolute value (magnitude) of the +/// Replace each entry in R with the nrm2w, absolute value (magnitude), of the /// corresponding entry in X. /// /// \tparam RMV 1-D or 2-D Kokkos::View specialization. diff --git a/blas/src/KokkosBlas1_reciprocal.hpp b/blas/src/KokkosBlas1_reciprocal.hpp index 7e171cb6df..19624d11c9 100644 --- a/blas/src/KokkosBlas1_reciprocal.hpp +++ b/blas/src/KokkosBlas1_reciprocal.hpp @@ -25,8 +25,8 @@ namespace KokkosBlas { /// \brief R(i,j) = reciprocal(X(i,j)) /// -/// Replace each entry in R with the reciprocalolute value (magnitude) of the -/// corresponding entry in X. +/// Replace each entry in R with the absolute value (magnitude), of the +/// reciprocal of the corresponding entry in X. /// /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have diff --git a/blas/src/KokkosBlas3_trsm.hpp b/blas/src/KokkosBlas3_trsm.hpp index 8de879b1f3..2e8d2f4cfa 100644 --- a/blas/src/KokkosBlas3_trsm.hpp +++ b/blas/src/KokkosBlas3_trsm.hpp @@ -42,8 +42,9 @@ namespace KokkosBlas { /// "L" or "l" indicates matrix A lower part is stored, the /// other part is not referenced /// \param trans [in] "N" or "n" for non-transpose, "T" or "t" for transpose, -/// "C" or "c" for conjugate transpose. \param diag [in] "U" or "u" indicates -/// the diagonal of A is assumed to be unit +/// "C" or "c" for conjugate transpose. +/// \param diag [in] "U" or "u" indicates the diagonal of A is assumed to be +/// unit // "N" or "n" indicated the diagonal of A is assumed to be // non-unit /// \param alpha [in] Input coefficient used for multiplication with B diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index b98abbbfc8..f66253a5f6 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -366,8 +366,7 @@ display_help_text() { echo "--disable-perftests: Do not build Kokkos Kernels performance tests" echo "--enable-perftests: build Kokkos Kernels performance tests (default)" echo "--deprecated-code Enable deprecated code (disabled by default)" - echo "--enable-perfsuite: build Kokkos Kernels performance tests with -RAJAPerf Suite" + echo "--export-compile-commands: export cmake compile_commands.json file" } @@ -381,6 +380,8 @@ KOKKOSKERNELS_DO_PERFTESTS=ON KOKKOSKERNELS_DO_PERFSUITE=OFF KOKKOSKERNELS_DO_EXAMPLES=ON +CMAKE_EXPORT_COMPILE_COMMANDS=OFF + #Build static libraries by default BUILD_SHARED_LIBRARIES=OFF @@ -539,6 +540,9 @@ do # This is the default KOKKOSKERNELS_DO_TESTS=ON ;; + --export-compile-commands) + CMAKE_EXPORT_COMPILE_COMMANDS=ON + ;; --enable-perfsuite) KOKKOSKERNELS_DO_PERFSUITE=ON ;; @@ -812,6 +816,6 @@ cd $STORE_KOKKOSKERNELS_BUILD_PATH # Configure kokkos-kernels echo "" -echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} +echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} echo "" -cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} +cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 4ce5a98dc0..d3b393ddde 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,6 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS ROCBLAS ROCSPARSE TEST_OPTIONAL_TPLS yaml-cpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in diff --git a/cmake/Modules/FindTPLROCBLAS.cmake b/cmake/Modules/FindTPLROCBLAS.cmake index 0217e8cf2c..c0a9de3b50 100644 --- a/cmake/Modules/FindTPLROCBLAS.cmake +++ b/cmake/Modules/FindTPLROCBLAS.cmake @@ -1,37 +1,13 @@ -IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) -ELSEIF (ROCBLAS_LIBRARIES) - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES}) -ELSEIF (ROCBLAS_LIBRARY_DIRS) - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES rocblas LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) -ELSEIF (KokkosKernels_ROCBLAS_ROOT) - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE - LIBRARIES - rocblas - LIBRARY_PATHS - ${KokkosKernels_ROCBLAS_ROOT}/lib - HEADERS - rocblas.h - HEADER_PATHS - ${KokkosKernels_ROCBLAS_ROOT}/include - ) -ELSEIF (DEFINED ENV{ROCM_PATH}) - MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") - SET(ROCBLAS_ROOT "$ENV{ROCM_PATH}/rocblas") - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE - LIBRARIES - rocblas - LIBRARY_PATHS - ${ROCBLAS_ROOT}/lib - HEADERS - rocblas.h - HEADER_PATHS - ${ROCBLAS_ROOT}/include - ) +# MPL: 12/29/2022: CMake regular way to find a package +FIND_PACKAGE(ROCBLAS) +if(TARGET roc::rocblas) +## MPL: 12/29/2022: Variable TPL_ROCBLAS_IMPORTED_NAME follows the requested convention +## of KokkosKernel (method kokkoskernels_import_tpl of kokkoskernels_tpls.cmake) + SET(TPL_ROCBLAS_IMPORTED_NAME roc::rocblas) + SET(TPL_IMPORTED_NAME roc::rocblas) +## MPL: 12/29/2022: A target comming from a TPL must follows the requested convention +## of KokkosKernel (method kokkoskernels_link_tpl of kokkoskernels_tpls.cmake) + ADD_LIBRARY(KokkosKernels::ROCBLAS ALIAS roc::rocblas) ELSE() - MESSAGE(ERROR "rocBLAS was not detected properly, please disable it or provide sufficient information at configure time.") - # Todo: figure out how to use the target defined during rocblas installation - # FIND_PACKAGE(ROCBLAS REQUIRED) - # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) - # GET_TARGET_PROPERTY(ROCBLAS_LINK_LIBRARIES ${ROCBLAS_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) -ENDIF() \ No newline at end of file + MESSAGE(FATAL_ERROR "Package ROCBLAS requested but not found") +ENDIF() diff --git a/cmake/Modules/FindTPLROCSPARSE.cmake b/cmake/Modules/FindTPLROCSPARSE.cmake index 52a0261b48..5f985ff3a8 100644 --- a/cmake/Modules/FindTPLROCSPARSE.cmake +++ b/cmake/Modules/FindTPLROCSPARSE.cmake @@ -1,37 +1,9 @@ -IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) -ELSEIF (ROCSPARSE_LIBRARIES) - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES}) -ELSEIF (ROCSPARSE_LIBRARY_DIRS) - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES rocsparse LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) -ELSEIF (KokkosKernels_ROCSPARSE_ROOT) - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE - LIBRARIES - rocsparse - LIBRARY_PATHS - ${KokkosKernels_ROCSPARSE_ROOT}/lib - HEADERS - rocsparse.h - HEADER_PATHS - ${KokkosKernels_ROCSPARSE_ROOT}/include - ) -ELSEIF (DEFINED ENV{ROCM_PATH}) - MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") - SET(ROCSPARSE_ROOT "$ENV{ROCM_PATH}/rocsparse") - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE - LIBRARIES - rocsparse - LIBRARY_PATHS - ${ROCSPARSE_ROOT}/lib - HEADERS - rocsparse.h - HEADER_PATHS - ${ROCSPARSE_ROOT}/include - ) +# MPL: 05/01/2023: This file follows the partern of FindTPLROCBLAS.cmake +FIND_PACKAGE(ROCSPARSE) +if(TARGET roc::rocsparse) + SET(TPL_ROCSPARSE_IMPORTED_NAME roc::rocsparse) + SET(TPL_IMPORTED_NAME roc::rocsparse) + ADD_LIBRARY(KokkosKernels::ROCSPARSE ALIAS roc::rocsparse) ELSE() - MESSAGE(ERROR "rocSPARSE was not detected properly, please disable it or provide sufficient information at configure time.") - # Todo: figure out how to use the target defined during rocsparse installation - # FIND_PACKAGE(ROCSPARSE REQUIRED) - # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) - # GET_TARGET_PROPERTY(ROCSPARSE_LINK_LIBRARIES ${ROCSPARSE_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) -ENDIF() \ No newline at end of file + MESSAGE(FATAL_ERROR "Package ROCSPARSE requested but not found") +ENDIF() diff --git a/cmake/kokkoskernels_components.cmake b/cmake/kokkoskernels_components.cmake index f33a62b6ff..56ab1a7c31 100644 --- a/cmake/kokkoskernels_components.cmake +++ b/cmake/kokkoskernels_components.cmake @@ -45,14 +45,6 @@ KOKKOSKERNELS_ADD_OPTION( "Whether to build the graph component. Default: OFF" ) -# The user requested individual components, -# the assumption is that a full build is not -# desired and ENABLE_ALL_COMPONENETS is turned -# off. -IF (KokkosKernels_ENABLE_COMPONENT_BATCHED OR KokkosKernels_ENABLE_COMPONENT_BLAS - OR KokkosKernels_ENABLE_COMPONENT_GRAPH OR KokkosKernels_ENABLE_COMPONENT_SPARSE) - SET(KokkosKernels_ENABLE_ALL_COMPONENTS OFF CACHE BOOL "" FORCE) -ENDIF() # Graph depends on everything else because it depends # on Sparse at the moment, breaking that dependency will @@ -72,13 +64,24 @@ IF (KokkosKernels_ENABLE_COMPONENT_SPARSE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) ENDIF() -# At this point, if ENABLE_ALL_COMPONENTS is -# still ON we need to enable all individual -# components as they are required for this -# build. +# If user requested to enable all components, enable all components IF (KokkosKernels_ENABLE_ALL_COMPONENTS) SET(KokkosKernels_ENABLE_COMPONENT_BATCHED ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_BLAS ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_SPARSE ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) ENDIF() + +# KOKKOSKERNELS_ALL_COMPONENTS_ENABLED says whether all components are on, +# regardless of how this came to be +# this is in the cache so we can use it as a global variable, +# but marking it as advanced should hide it from GUIs +IF ( KokkosKernels_ENABLE_COMPONENT_BATCHED + AND KokkosKernels_ENABLE_COMPONENT_BLAS + AND KokkosKernels_ENABLE_COMPONENT_GRAPH + AND KokkosKernels_ENABLE_COMPONENT_SPARSE) + SET(KOKKOSKERNELS_ALL_COMPONENTS_ENABLED ON CACHE BOOL "" FORCE) +ELSE() + SET(KOKKOSKERNELS_ALL_COMPONENTS_ENABLED OFF CACHE BOOL "" FORCE) +ENDIF() +mark_as_advanced(FORCE KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) \ No newline at end of file diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index e4e4981973..9ebb104378 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -53,4 +53,60 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, } // namespace Impl } // namespace KokkosKernels +/* + * Asserts and error checking macros/functions. + * + * KK_KERNEL** are for error checking within kokkos kernels. + * + * Any check with "assert" in the name is disabled for release builds + * + * For _MSG checks, the msg argument can contain '<<' if not a kernel check. + * + * This code is adapted from EKAT/src/ekat/ekat_assert.hpp + */ + +// Internal do not call directly +#define IMPL_THROW(condition, msg, exception_type) \ + do { \ + if (!(condition)) { \ + std::stringstream _ss_; \ + _ss_ << __FILE__ << ":" << __LINE__ << ": FAIL:\n" << #condition; \ + _ss_ << "\n" << msg; \ + throw exception_type(_ss_.str()); \ + } \ + } while (0) + +// SYCL cannot printf like the other backends quite yet +#define IMPL_KERNEL_THROW(condition, msg) \ + do { \ + if (!(condition)) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("KERNEL CHECK FAILED:\n %s\n %s\n", \ + #condition, msg); \ + Kokkos::abort(""); \ + } \ + } while (0) + +#ifndef NDEBUG +#define KK_ASSERT(condition) IMPL_THROW(condition, "", std::logic_error) +#define KK_ASSERT_MSG(condition, msg) \ + IMPL_THROW(condition, msg, std::logic_error) +#define KK_KERNEL_ASSERT(condition) IMPL_KERNEL_THROW(condition, "") +#define KK_KERNEL_ASSERT_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) +#else +#define KK_ASSERT(condition) ((void)(0)) +#define KK_ASSERT_MSG(condition, msg) ((void)(0)) +#define KK_KERNEL_ASSERT(condition) ((void)(0)) +#define KK_KERNEL_ASSERT_MSG(condition, msg) ((void)(0)) +#endif + +#define KK_REQUIRE(condition) IMPL_THROW(condition, "", std::logic_error) +#define KK_REQUIRE_MSG(condition, msg) \ + IMPL_THROW(condition, msg, std::logic_error) + +#define KK_KERNEL_REQUIRE(condition) IMPL_KERNEL_THROW(condition, "") +#define KK_KERNEL_REQUIRE_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) + +#define KK_ERROR_MSG(msg) KK_REQUIRE_MSG(false, msg) +#define KK_KERNEL_ERROR_MSG(msg) KK_KERNEL_REQUIRE_MSG(false, msg) + #endif // KOKKOSKERNELS_ERROR_HPP diff --git a/example/gmres/test_prec.cpp b/example/gmres/test_prec.cpp index f4aca0d6f6..8d1ff74b87 100644 --- a/example/gmres/test_prec.cpp +++ b/example/gmres/test_prec.cpp @@ -27,9 +27,11 @@ int main(int argc, char* argv[]) { using OT = int; using EXSP = Kokkos::DefaultExecutionSpace; using MESP = typename EXSP::memory_space; - using CRS = KokkosSparse::CrsMatrix; + using CRS = + KokkosSparse::CrsMatrix, void, OT>; - using ViewVectorType = Kokkos::View; + using ViewVectorType = + Kokkos::View>; using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; diff --git a/master_history.txt b/master_history.txt index a252fcb06e..6c9f253c07 100644 --- a/master_history.txt +++ b/master_history.txt @@ -19,4 +19,5 @@ tag: 3.6.00 date: 04/06/2022 master: 8381db04 release: a7e683c4 tag: 3.6.01 date: 05/23/2022 master: e09389ae release: e1d8de42 tag: 3.7.00 date: 08/25/2022 master: 42ab7a29 release: 9cc88ffa tag: 3.7.01 date: 12/01/2022 master: 04821ac3 release: 6cb632b6 -tag: 4.0.0 date: 02/32/2023 master: b4014bf2 release: a10dff20 +tag: 4.0.00 date: 02/23/2023 master: b4014bf2 release: a10dff20 +tag: 4.0.01 date: 04/26/2023 master: b9c1bab7 release: 8809e41c diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index bc638b64c1..28752e9c6c 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -28,7 +28,7 @@ if (KokkosKernels_ENABLE_PERFTESTS) KOKKOSKERNELS_INCLUDE_DIRECTORIES(sparse) - if(Kokkos_ENABLE_TESTS_AND_PERFSUITE) + if(KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) #Add RPS implementations of KK perf tests here KOKKOSKERNELS_ADD_EXECUTABLE( tracked_testing diff --git a/sparse/impl/KokkosSparse_gmres_impl.hpp b/sparse/impl/KokkosSparse_gmres_impl.hpp index aa90a70757..8c7231f90c 100644 --- a/sparse/impl/KokkosSparse_gmres_impl.hpp +++ b/sparse/impl/KokkosSparse_gmres_impl.hpp @@ -92,6 +92,7 @@ struct GmresWrap { std::cout << " ortho: " << ((ortho == GmresHandle::Ortho::CGS2) ? "CGS2" : "MGS") << std::endl; + std::cout << " precond: " << (precond ? "ON" : "OFF") << std::endl; } // Make tmp work views diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index 481742aa83..c482aff429 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -78,6 +78,11 @@ struct IlutWrap { const URowMapType& U_row_map, const UEntriesType& U_entries, const UValuesType& U_values, LURowMapType& LU_row_map, LUEntriesType& LU_entries, LUValuesType& LU_values) { + std::string myalg("SPGEMM_KK_MEMORY"); + KokkosSparse::SPGEMMAlgorithm spgemm_algorithm = + KokkosSparse::StringToSPGEMMAlgorithm(myalg); + kh.create_spgemm_handle(spgemm_algorithm); + const size_type nrows = ih.get_nrows(); KokkosSparse::Experimental::spgemm_symbolic( @@ -95,6 +100,8 @@ struct IlutWrap { // Need to sort LU CRS if on CUDA! sort_crs_matrix(LU_row_map, LU_entries, LU_values); + + kh.destroy_spgemm_handle(); } /** @@ -329,11 +336,17 @@ struct IlutWrap { const auto out_val = lpu_col == col_idx ? lpu_val : r_val / diag; // store output entries if (row_idx >= col_idx) { + KK_KERNEL_ASSERT_MSG( + l_new_nnz < L_new_row_map(row_idx + 1), + "add_candidates: Overflowed L_new, is your A matrix sorted?"); L_new_entries(l_new_nnz) = col_idx; L_new_values(l_new_nnz) = row_idx == col_idx ? 1. : out_val; ++l_new_nnz; } if (row_idx <= col_idx) { + KK_KERNEL_ASSERT_MSG( + u_new_nnz < U_new_row_map(row_idx + 1), + "add_candidates: Overflowed U_new, is your A matrix sorted?"); U_new_entries(u_new_nnz) = col_idx; U_new_values(u_new_nnz) = out_val; ++u_new_nnz; @@ -407,7 +420,8 @@ struct IlutWrap { const auto l_col = L_entries(l_row_nnz); const auto u_row = Ut_entries(ut_row_nnz); if (l_col == u_row && l_col < last_entry) { - sum += L_values(l_row_nnz) * Ut_values(ut_row_nnz); + const scalar_t ut_val = Ut_values(ut_row_nnz); + sum += L_values(l_row_nnz) * ut_val; } if (static_cast(u_row) == row_idx) { ut_nnz = ut_row_nnz; @@ -420,65 +434,83 @@ struct IlutWrap { return Kokkos::make_pair(a_val - sum, ut_nnz); } - template - KOKKOS_FUNCTION static void compute_l_u_factors_impl( - const ARowMapType& A_row_map, const AEntriesType& A_entries, - const AValuesType& A_values, LRowMapType& L_row_map, - LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_row_map, - UEntriesType& U_entries, UValuesType& U_values, UtRowMapType& Ut_row_map, - UtEntriesType& Ut_entries, UtValuesType& Ut_values, MemberType& team) { - const auto row_idx = team.league_rank(); - - const auto l_row_nnz_begin = L_row_map(row_idx); - const auto l_row_nnz_end = L_row_map(row_idx + 1); + /** + * Implements a single iteration/sweep of the fixed-point ILU algorithm. + * The results of this function are non-deterministic due to concurrent + * reading and writing of Ut values. async_update can be set to false to + * make this function determistic, but that could cause par_ilut + * to take longer (more iterations) to converge. + */ + template + static void compute_l_u_factors_impl( + IlutHandle& ih, const ARowMapType& A_row_map, + const AEntriesType& A_entries, const AValuesType& A_values, + LRowMapType& L_row_map, LEntriesType& L_entries, LValuesType& L_values, + URowMapType& U_row_map, UEntriesType& U_entries, UValuesType& U_values, + UtRowMapType& Ut_row_map, UtEntriesType& Ut_entries, + UtValuesType& Ut_values_arg) { + // UtValues needs to be Atomic if async updates are on. Otherwise, + // non-atomic is fine. + using UtValuesSafeType = std::conditional_t< + async_update, + Kokkos::View< + typename UtValuesType::non_const_value_type*, + typename UtValuesType::array_layout, + typename UtValuesType::device_type, + Kokkos::MemoryTraits >, + UtValuesType>; + + UtValuesSafeType Ut_values = Ut_values_arg; + const size_type nrows = ih.get_nrows(); Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, l_row_nnz_begin, l_row_nnz_end - 1), - [&](const size_type l_nnz) { - const auto col_idx = L_entries(l_nnz); - const auto u_diag = Ut_values(Ut_row_map(col_idx + 1) - 1); - if (u_diag != 0.0) { - const auto new_val = - compute_sum(row_idx, col_idx, A_row_map, A_entries, A_values, - L_row_map, L_entries, L_values, Ut_row_map, - Ut_entries, Ut_values) - .first / - u_diag; - L_values(l_nnz) = new_val; + "compute_l_u_factors", range_policy(0, nrows), + KOKKOS_LAMBDA(const size_type row_idx) { + const auto l_row_nnz_begin = L_row_map(row_idx); + const auto l_row_nnz_end = + L_row_map(row_idx + 1) - 1; // skip diagonal for L + + for (auto l_nnz = l_row_nnz_begin; l_nnz < l_row_nnz_end; ++l_nnz) { + const auto col_idx = L_entries(l_nnz); + const scalar_t u_diag = Ut_values(Ut_row_map(col_idx + 1) - 1); + if (u_diag != 0.0) { + const auto new_val = + compute_sum(row_idx, col_idx, A_row_map, A_entries, A_values, + L_row_map, L_entries, L_values, Ut_row_map, + Ut_entries, Ut_values) + .first / + u_diag; + L_values(l_nnz) = new_val; + } } - }); - - team.team_barrier(); - - const auto u_row_nnz_begin = U_row_map(row_idx); - const auto u_row_nnz_end = U_row_map(row_idx + 1); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, u_row_nnz_begin, u_row_nnz_end), - [&](const size_type u_nnz) { - const auto col_idx = U_entries(u_nnz); - const auto sum = compute_sum(row_idx, col_idx, A_row_map, A_entries, - A_values, L_row_map, L_entries, L_values, - Ut_row_map, Ut_entries, Ut_values); - const auto new_val = sum.first; - const auto ut_nnz = sum.second; - U_values(u_nnz) = new_val; - Ut_values(ut_nnz) = new_val; // ut_nnz is not guarateed to fail into - // range used exclusively by this team + const auto u_row_nnz_begin = U_row_map(row_idx); + const auto u_row_nnz_end = U_row_map(row_idx + 1); + + for (auto u_nnz = u_row_nnz_begin; u_nnz < u_row_nnz_end; ++u_nnz) { + const auto col_idx = U_entries(u_nnz); + const auto sum = compute_sum( + row_idx, col_idx, A_row_map, A_entries, A_values, L_row_map, + L_entries, L_values, Ut_row_map, Ut_entries, Ut_values); + const auto new_val = sum.first; + const auto ut_nnz = sum.second; + U_values(u_nnz) = new_val; + + // ut_nnz is not guarateed to fail into range used exclusively + // by this thread. Updating it here opens up potential race + // conditions but usually causes faster convergence. + if (async_update) { + Ut_values(ut_nnz) = new_val; + } + } }); } - /** - * Implements a single iteration/sweep of the fixed-point ILU algorithm. - * The results of this function are non-deterministic due to concurrent - * reading and writing of Ut values. deterministic can be set to true to - * make this function determistic, but it will be run in Serial exe space - * if so. - */ template ; - using smember_type = typename spolicy_type::member_type; - - const size_type nrows = ih.get_nrows(); - spolicy_type policy(nrows, 1); - - auto A_row_map_h = Kokkos::create_mirror_view(A_row_map); - auto A_entries_h = Kokkos::create_mirror_view(A_entries); - auto A_values_h = Kokkos::create_mirror_view(A_values); - auto L_row_map_h = Kokkos::create_mirror_view(L_row_map); - auto L_entries_h = Kokkos::create_mirror_view(L_entries); - auto L_values_h = Kokkos::create_mirror_view(L_values); - auto U_row_map_h = Kokkos::create_mirror_view(U_row_map); - auto U_entries_h = Kokkos::create_mirror_view(U_entries); - auto U_values_h = Kokkos::create_mirror_view(U_values); - auto Ut_row_map_h = Kokkos::create_mirror_view(Ut_row_map); - auto Ut_entries_h = Kokkos::create_mirror_view(Ut_entries); - auto Ut_values_h = Kokkos::create_mirror_view(Ut_values); - - Kokkos::deep_copy(A_row_map_h, A_row_map); - Kokkos::deep_copy(A_entries_h, A_entries); - Kokkos::deep_copy(A_values_h, A_values); - Kokkos::deep_copy(L_row_map_h, L_row_map); - Kokkos::deep_copy(L_entries_h, L_entries); - Kokkos::deep_copy(L_values_h, L_values); - Kokkos::deep_copy(U_row_map_h, U_row_map); - Kokkos::deep_copy(U_entries_h, U_entries); - Kokkos::deep_copy(U_values_h, U_values); - Kokkos::deep_copy(Ut_row_map_h, Ut_row_map); - Kokkos::deep_copy(Ut_entries_h, Ut_entries); - Kokkos::deep_copy(Ut_values_h, Ut_values); - - Kokkos::parallel_for( - "compute_l_u_factors", policy, - KOKKOS_LAMBDA(const smember_type& team) { - compute_l_u_factors_impl( - A_row_map_h, A_entries_h, A_values_h, L_row_map_h, L_entries_h, - L_values_h, U_row_map_h, U_entries_h, U_values_h, Ut_row_map_h, - Ut_entries_h, Ut_values_h, team); - }); - - Kokkos::deep_copy(L_values, L_values_h); - Kokkos::deep_copy(U_values, U_values_h); - Kokkos::deep_copy(Ut_values, Ut_values_h); -#else - throw std::runtime_error( - "compute_l_u factors cannot be deterministic without Kokkos::Serial " - "available"); -#endif + UtValuesType& Ut_values, const bool async_update) { + if (async_update) { + compute_l_u_factors_impl( + ih, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values, Ut_row_map, Ut_entries, Ut_values); } else { - const auto policy = ih.get_default_team_policy(); - - Kokkos::parallel_for( - "compute_l_u_factors", policy, - KOKKOS_LAMBDA(const member_type& team) { - compute_l_u_factors_impl(A_row_map, A_entries, A_values, L_row_map, - L_entries, L_values, U_row_map, U_entries, - U_values, Ut_row_map, Ut_entries, - Ut_values, team); - }); + compute_l_u_factors_impl( + ih, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values, Ut_row_map, Ut_entries, Ut_values); } } @@ -716,6 +694,8 @@ struct IlutWrap { RRowMapType& R_row_map, REntriesType& R_entries, RValuesType& R_values, LURowMapType& LU_row_map, LUEntriesType& LU_entries, LUValuesType& LU_values) { + scalar_t result; + multiply_matrices(kh, ih, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, LU_row_map, LU_entries, LU_values); @@ -731,8 +711,6 @@ struct IlutWrap { &kh, A_row_map, A_entries, A_values, 1., LU_row_map, LU_entries, LU_values, -1., R_row_map, R_entries, R_values); - scalar_t result; - auto policy = ih.get_default_team_policy(); Kokkos::parallel_reduce( @@ -840,7 +818,8 @@ struct IlutWrap { const AValuesType& A_values, LRowMapType& L_row_map, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_row_map, UEntriesType& U_entries, - UValuesType& U_values, bool deterministic) { + UValuesType& U_values) { + // Get config settings from handle const size_type nrows = thandle.get_nrows(); const auto fill_in_limit = thandle.get_fill_in_limit(); const auto l_nnz_limit = @@ -852,10 +831,18 @@ struct IlutWrap { thandle.get_residual_norm_delta_stop(); const size_type max_iter = thandle.get_max_iter(); - std::string myalg("SPGEMM_KK_MEMORY"); - KokkosSparse::SPGEMMAlgorithm spgemm_algorithm = - KokkosSparse::StringToSPGEMMAlgorithm(myalg); - kh.create_spgemm_handle(spgemm_algorithm); + const auto verbose = thandle.get_verbose(); + const auto async_update = false; // thandle.get_async_update(); + + if (verbose) { + std::cout << "Starting PARILUT with..." << std::endl; + std::cout << " num_rows: " << nrows << std::endl; + std::cout << " fill_in_limit: " << fill_in_limit << std::endl; + std::cout << " max_iter: " << max_iter << std::endl; + std::cout << " res_norm_delta_stop: " << residual_norm_delta_stop + << std::endl; + std::cout << " async_update: " << async_update << std::endl; + } kh.create_spadd_handle(true /*we expect inputs to be sorted*/); @@ -882,8 +869,8 @@ struct IlutWrap { auto V_copy = Kokkos::create_mirror_view(V_copy_d); size_type itr = 0; + scalar_t curr_residual = std::numeric_limits::max(); scalar_t prev_residual = std::numeric_limits::max(); - bool converged = false; // Set the initial L/U values for the initial approximation initialize_LU(thandle, A_row_map, A_entries, A_values, L_row_map, L_entries, @@ -892,7 +879,8 @@ struct IlutWrap { // // main loop // - while (!converged && itr < max_iter) { + bool stop = nrows == 0; // Don't iterate at all if nrows=0 + while (!stop && itr < max_iter) { // LU = L*U if (prev_residual == std::numeric_limits::max()) { multiply_matrices(kh, thandle, L_row_map, L_entries, L_values, @@ -915,7 +903,7 @@ struct IlutWrap { compute_l_u_factors( thandle, A_row_map, A_entries, A_values, L_new_row_map, L_new_entries, L_new_values, U_new_row_map, U_new_entries, U_new_values, - Ut_new_row_map, Ut_new_entries, Ut_new_values, deterministic); + Ut_new_row_map, Ut_new_entries, Ut_new_values, async_update); // Filter smallest elements from L_new and U_new. Store result back // in L and U. @@ -949,18 +937,28 @@ struct IlutWrap { compute_l_u_factors(thandle, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, Ut_new_row_map, Ut_new_entries, Ut_new_values, - deterministic); + async_update); - // Compute residual and terminate if converged + // Compute residual and check stop conditions { - const auto curr_residual = compute_residual_norm( + curr_residual = compute_residual_norm( kh, thandle, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, R_row_map, R_entries, R_values, LU_row_map, LU_entries, LU_values); - if (karith::abs(prev_residual - curr_residual) <= - karith::abs(residual_norm_delta_stop)) { - converged = true; + if (verbose) { + std::cout << "Completed itr " << itr + << ", residual is: " << curr_residual << std::endl; + } + + const auto curr_delta = karith::abs(prev_residual - curr_residual); + if (curr_delta <= residual_norm_delta_stop) { + if (verbose) { + std::cout << " Itr-to-itr residual change has dropped below " + "residual_norm_delta_stop, stop" + << std::endl; + } + stop = true; } else { prev_residual = curr_residual; } @@ -969,7 +967,13 @@ struct IlutWrap { ++itr; } - kh.destroy_spgemm_handle(); + curr_residual = nrows == 0 ? scalar_t(0.) : curr_residual; + if (verbose) { + std::cout << "PAR_ILUT stopped in " << itr << " iterations with residual " + << curr_residual << std::endl; + } + thandle.set_stats(itr, curr_residual); + kh.destroy_spadd_handle(); } // end ilut_numeric diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp index 72130a07ad..fd3bc2b8bb 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp @@ -114,8 +114,7 @@ struct PAR_ILUT_NUMERIC { const AValuesType &A_values, LRowMapType &L_row_map, LEntriesType &L_entries, LValuesType &L_values, URowMapType &U_row_map, - UEntriesType &U_entries, UValuesType &U_values, - bool deterministic = false); + UEntriesType &U_entries, UValuesType &U_values); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -135,15 +134,14 @@ struct PAR_ILUT_NUMERICget_par_ilut_handle(); using Ilut = Experimental::IlutWrap< typename std::remove_pointer::type>; Ilut::ilut_numeric(*handle, *par_ilut_handle, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, - U_entries, U_values, deterministic); + U_entries, U_values); } }; diff --git a/sparse/impl/KokkosSparse_par_ilut_symbolic_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_symbolic_impl.hpp index 573f550832..4e331d2f11 100644 --- a/sparse/impl/KokkosSparse_par_ilut_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_symbolic_impl.hpp @@ -44,6 +44,10 @@ void ilut_symbolic(IlutHandle& thandle, const ARowMapType& A_row_map_d, using size_type = typename IlutHandle::size_type; using Ilut = IlutWrap; + const size_type a_nrows = A_row_map_d.extent(0); + const size_type nrows = a_nrows > 0 ? (a_nrows - 1) : 0; + thandle.set_nrows(nrows); + const auto policy = thandle.get_default_team_policy(); // Sizing for the initial L/U approximation @@ -83,8 +87,10 @@ void ilut_symbolic(IlutHandle& thandle, const ARowMapType& A_row_map_d, const size_type nnzsL = Ilut::prefix_sum(L_row_map_d); const size_type nnzsU = Ilut::prefix_sum(U_row_map_d); + // Set symbolic info on handle thandle.set_nnzL(nnzsL); thandle.set_nnzU(nnzsU); + thandle.set_symbolic_complete(); } // end ilut_symbolic diff --git a/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp b/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp index a27a685a00..b822d12ab0 100644 --- a/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp @@ -108,7 +108,6 @@ struct PAR_ILUT_SYMBOLICset_symbolic_complete(); } }; #endif diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 7090c4f948..17611c3f2c 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -299,6 +299,10 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, typedef typename KernelHandle::scalar_t scalar_type; typedef typename KernelHandle::memory_space memory_space; + (void)row_map; + (void)entries; + (void)values; + const bool is_cuda_space = std::is_same::value || std::is_same::value || diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 12f3614a07..598cf9577d 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -871,12 +871,17 @@ class KokkosKernelsHandle { } PAR_ILUTHandleType *get_par_ilut_handle() { return this->par_ilutHandle; } - void create_par_ilut_handle(const size_type nrows, const size_type nnzL = 0, - const size_type nnzU = 0, - const size_type max_iter = 1) { + void create_par_ilut_handle( + const size_type max_iter = 20, + const typename PAR_ILUTHandleType::float_t residual_norm_delta_stop = + 1e-2, + const typename PAR_ILUTHandleType::float_t fill_in_limit = 0.75, + const bool async_update = false, const bool verbose = false) { this->destroy_par_ilut_handle(); this->is_owner_of_the_par_ilut_handle = true; - this->par_ilutHandle = new PAR_ILUTHandleType(nrows, nnzL, nnzU, max_iter); + this->par_ilutHandle = + new PAR_ILUTHandleType(max_iter, residual_norm_delta_stop, + fill_in_limit, async_update, verbose); this->par_ilutHandle->set_team_size(this->team_work_size); this->par_ilutHandle->set_vector_size(this->vector_size); } diff --git a/sparse/src/KokkosSparse_IOUtils.hpp b/sparse/src/KokkosSparse_IOUtils.hpp index d957cf4949..77934b4f3e 100644 --- a/sparse/src/KokkosSparse_IOUtils.hpp +++ b/sparse/src/KokkosSparse_IOUtils.hpp @@ -117,16 +117,42 @@ void kk_diagonally_dominant_sparseMatrix_generate( OrdinalType row_size_variance, OrdinalType bandwidth, ScalarType *&values, SizeType *&rowPtr, OrdinalType *&colInd, ScalarType diagDominance = 10 * Kokkos::ArithTraits::one()) { - rowPtr = new SizeType[nrows + 1]; - + rowPtr = new SizeType[nrows + 1]; OrdinalType elements_per_row = nnz / nrows; + // Set a hard limit to the actual entries in any one row, so that the + // loop to find a column not already taken will terminate quickly. + OrdinalType max_elements_per_row = 0.7 * bandwidth; + OrdinalType requested_max_elements_per_row = + elements_per_row + 0.5 * row_size_variance; + if (requested_max_elements_per_row > max_elements_per_row) { + std::cerr + << "kk_diagonally_dominant_sparseMatrix_generate: given the bandwidth (" + << bandwidth << "),\n"; + std::cerr << " can insert a maximum of " << max_elements_per_row + << " entries per row (0.7*bandwidth).\n"; + std::cerr << " But given the requested average entries per row of " + << elements_per_row << " and variance of " << row_size_variance + << ",\n"; + std::cerr << " there should be up to " << requested_max_elements_per_row + << " entries per row.\n"; + std::cerr << " Increase the bandwidth, or decrease nnz and/or " + "row_size_variance.\n"; + throw std::invalid_argument( + "kk_diagonally_dominant_sparseMatrix_generate: requested too many " + "entries per row for the given bandwidth."); + } srand(13721); rowPtr[0] = 0; for (int row = 0; row < nrows; row++) { - int varianz = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; - if (varianz < 1) varianz = 1; - if (varianz > 0.75 * ncols) varianz = 0.75 * ncols; - rowPtr[row + 1] = rowPtr[row] + elements_per_row + varianz; + // variance is how many more (or less) entries this row has compared to the + // mean (elements_per_row). + OrdinalType variance = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; + OrdinalType entries_in_row = elements_per_row + variance; + // Always have at least one entry (for the diagonal) + if (entries_in_row < 1) entries_in_row = 1; + if (entries_in_row > max_elements_per_row) + entries_in_row = max_elements_per_row; + rowPtr[row + 1] = rowPtr[row] + entries_in_row; if (rowPtr[row + 1] <= rowPtr[row]) // This makes sure that there is rowPtr[row + 1] = rowPtr[row] + 1; // at least one nonzero in the row } @@ -141,6 +167,9 @@ void kk_diagonally_dominant_sparseMatrix_generate( for (SizeType k = rowPtr[row]; k < rowPtr[row + 1] - 1; k++) { while (true) { OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row; + // When bandwidth would extend past the columns of the matrix, wrap + // the entry around to the other side. This means the final matrix can + // actually have structural bandwidth close to ncols. while (pos < 0) pos += ncols; while (pos >= ncols) pos -= ncols; diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp new file mode 100644 index 0000000000..a257b8f09c --- /dev/null +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -0,0 +1,136 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// ************************************************************************ +//@HEADER + +/// @file KokkosSparse_LUPrec.hpp + +#ifndef KK_LU_PREC_HPP +#define KK_LU_PREC_HPP + +#include +#include +#include +#include + +namespace KokkosSparse { +namespace Experimental { + +/// \class LUPrec +/// \brief This class is for applying LU preconditioning. +/// It takes L and U and the apply method returns U^inv L^inv x +/// \tparam CRS the CRS type of L and U +/// +/// Preconditioner provides the following methods +/// - initialize() Does nothing; members initialized upon object construction. +/// - isInitialized() returns true +/// - compute() Does nothing; members initialized upon object construction. +/// - isComputed() returns true +/// +template +class LUPrec : public KokkosSparse::Experimental::Preconditioner { + public: + using ScalarType = typename std::remove_const::type; + using EXSP = typename CRS::execution_space; + using MEMSP = typename CRS::memory_space; + using karith = typename Kokkos::ArithTraits; + using View1d = typename Kokkos::View; + + private: + // trsm takes host views + CRS _L, _U; + View1d _tmp, _tmp2; + mutable KernelHandle _khL; + mutable KernelHandle _khU; + + public: + //! Constructor: + template + LUPrec(const CRSArg &L, const CRSArg &U) + : _L(L), + _U(U), + _tmp("LUPrec::_tmp", L.numRows()), + _tmp2("LUPrec::_tmp", L.numRows()), + _khL(), + _khU() { + KK_REQUIRE_MSG(L.numRows() == U.numRows(), + "LUPrec: L.numRows() != U.numRows()"); + + _khL.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, L.numRows(), + true); + _khU.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, U.numRows(), + false); + } + + //! Destructor. + virtual ~LUPrec() { + _khL.destroy_sptrsv_handle(); + _khU.destroy_sptrsv_handle(); + } + + ///// \brief Apply the preconditioner to X, putting the result in Y. + ///// + ///// \tparam XViewType Input vector, as a 1-D Kokkos::View + ///// \tparam YViewType Output vector, as a nonconst 1-D Kokkos::View + ///// + ///// \param transM [in] Not used. + ///// \param alpha [in] Not used + ///// \param beta [in] Not used. + ///// + ///// It takes L and U and the stores U^inv L^inv X in Y + // + virtual void apply( + const Kokkos::View> &X, + const Kokkos::View> &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { + // tmp = trsv(L, x); //Apply L^inv to x + // y = trsv(U, tmp); //Apply U^inv to tmp + + KK_REQUIRE_MSG(transM[0] == NoTranspose[0], + "LUPrec::apply only supports 'N' for transM"); + + sptrsv_symbolic(&_khL, _L.graph.row_map, _L.graph.entries); + sptrsv_solve(&_khL, _L.graph.row_map, _L.graph.entries, _L.values, X, _tmp); + + sptrsv_symbolic(&_khU, _U.graph.row_map, _U.graph.entries); + sptrsv_solve(&_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, + _tmp2); + + KokkosBlas::axpby(alpha, _tmp2, beta, Y); + } + //@} + + //! Set this preconditioner's parameters. + void setParameters() {} + + void initialize() {} + + //! True if the preconditioner has been successfully initialized, else false. + bool isInitialized() const { return true; } + + void compute() {} + + //! True if the preconditioner has been successfully computed, else false. + bool isComputed() const { return true; } + + //! True if the preconditioner implements a transpose operator apply. + bool hasTransposeApply() const { return true; } +}; + +} // namespace Experimental +} // End namespace KokkosSparse + +#endif diff --git a/sparse/src/KokkosSparse_MatrixPrec.hpp b/sparse/src/KokkosSparse_MatrixPrec.hpp index 952036b7c7..3cef1b6315 100644 --- a/sparse/src/KokkosSparse_MatrixPrec.hpp +++ b/sparse/src/KokkosSparse_MatrixPrec.hpp @@ -32,13 +32,9 @@ namespace Experimental { /// already has a matrix representation of their /// preconditioner M. The class applies an /// SpMV with M as the preconditioning step. -/// \tparam ScalarType Type of the matrix's entries -/// \tparam Layout Kokkos layout of vectors X and Y to which -/// the preconditioner is applied -/// \tparam EXSP Execution space for the preconditioner apply -/// \tparam Ordinal Type of the matrix's indices; +/// \tparam CRS the type of compressed matrix /// -/// Preconditioner provides the following methods +/// MatrixPrec provides the following methods /// - initialize() Does nothing; Matrix initialized upon object construction. /// - isInitialized() returns true /// - compute() Does nothing; Matrix initialized upon object construction. @@ -47,19 +43,17 @@ namespace Experimental { template class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { private: - CRS A; - - bool isInitialized_ = true; - bool isComputed_ = true; + CRS _A; public: using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; + using MEMSP = typename CRS::memory_space; using karith = typename Kokkos::ArithTraits; //! Constructor: template - MatrixPrec(const CRSArg &mat) : A(mat) {} + MatrixPrec(const CRSArg &mat) : _A(mat) {} //! Destructor. virtual ~MatrixPrec() {} @@ -80,12 +74,12 @@ class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { ///\cdot X\f$. ///// The typical case is \f$\beta = 0\f$ and \f$\alpha = 1\f$. // - virtual void apply(const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", - ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { - KokkosSparse::spmv(transM, alpha, A, X, beta, Y); + virtual void apply( + const Kokkos::View> &X, + const Kokkos::View> &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { + KokkosSparse::spmv(transM, alpha, _A, X, beta, Y); } //@} @@ -95,12 +89,12 @@ class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { void initialize() {} //! True if the preconditioner has been successfully initialized, else false. - bool isInitialized() const { return isInitialized_; } + bool isInitialized() const { return true; } void compute() {} //! True if the preconditioner has been successfully computed, else false. - bool isComputed() const { return isComputed_; } + bool isComputed() const { return true; } //! True if the preconditioner implements a transpose operator apply. bool hasTransposeApply() const { return true; } diff --git a/sparse/src/KokkosSparse_Preconditioner.hpp b/sparse/src/KokkosSparse_Preconditioner.hpp index 1c32d5eeaa..8fdd9398b2 100644 --- a/sparse/src/KokkosSparse_Preconditioner.hpp +++ b/sparse/src/KokkosSparse_Preconditioner.hpp @@ -27,11 +27,7 @@ namespace Experimental { /// \class Preconditioner /// \brief Interface for KokkosKernels preconditioners -/// \tparam ScalarType Type of the matrix's entries -/// \tparam Layout Kokkos layout of vectors X and Y to which -/// the preconditioner is applied -/// \tparam EXSP Execution space for the preconditioner apply -/// \tparam Ordinal Type of the matrix's indices; +/// \tparam CRS Type of the compressed matrix /// /// Preconditioner provides the following methods /// - initialize() performs all operations based on the graph of the @@ -56,6 +52,7 @@ class Preconditioner { public: using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; + using MEMSP = typename CRS::memory_space; using karith = typename Kokkos::ArithTraits; //! Constructor: @@ -80,11 +77,11 @@ class Preconditioner { ///\cdot X\f$. ///// The typical case is \f$\beta = 0\f$ and \f$\alpha = 1\f$. // - virtual void apply(const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", - ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const = 0; + virtual void apply( + const Kokkos::View> &X, + const Kokkos::View> &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const = 0; //@} //! Set this preconditioner's parameters. diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index 90fa3beeef..672da5b4de 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -34,7 +34,7 @@ namespace KokkosSparse { namespace Experimental { template -void mdf_symbolic(crs_matrix_type& A, MDF_handle& handle) { +void mdf_symbolic(const crs_matrix_type& A, MDF_handle& handle) { using size_type = typename crs_matrix_type::size_type; using ordinal_type = typename crs_matrix_type::ordinal_type; @@ -63,7 +63,7 @@ void mdf_symbolic(crs_matrix_type& A, MDF_handle& handle) { } // mdf_symbolic template -void mdf_numeric(crs_matrix_type& A, MDF_handle& handle) { +void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; using values_type = typename crs_matrix_type::values_type::non_const_type; diff --git a/sparse/src/KokkosSparse_mdf_handle.hpp b/sparse/src/KokkosSparse_mdf_handle.hpp index 6f6f2658be..03fd660b95 100644 --- a/sparse/src/KokkosSparse_mdf_handle.hpp +++ b/sparse/src/KokkosSparse_mdf_handle.hpp @@ -62,7 +62,7 @@ struct MDF_handle { crs_matrix_type L, U; - MDF_handle(const crs_matrix_type A) + MDF_handle(const crs_matrix_type& A) : numRows(A.numRows()), permutation(col_ind_type("row permutation", A.numRows())), permutation_inv(col_ind_type("inverse row permutation", A.numRows())), @@ -89,6 +89,7 @@ struct MDF_handle { } col_ind_type get_permutation() { return permutation; } + col_ind_type get_permutation_inv() { return permutation_inv; } void sort_factors() { KokkosSparse::sort_crs_matrix(L); diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index ca46976366..c7def802b9 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -20,9 +20,13 @@ /// This file provides KokkosSparse::par_ilut. This function performs a /// local (no MPI) sparse ILU(t) on matrices stored in /// compressed row sparse ("Crs") format. It is expected that symbolic -/// is called before numeric. The numeric function offers a deterministic -/// flag that will force the function to have deterministic results. This -/// is useful for testing but incurs a big performance penalty. +/// is called before numeric. The handle offers an async_update +/// flag that controls whether asynchronous updates are allowed while computing +/// L U factors. This is useful for testing as it allows for repeatable +/// (deterministic) results but may cause the algorithm to take longer (more +/// iterations) to converge. The par_ilut algorithm will repeat (iterate) until +/// max_iters is hit or the improvement in the residual from iter to iter drops +/// below a certain threshold. /// /// This algorithm is described in the paper: /// PARILUT - A New Parallel Threshold ILU Factorization - Anzt, Chow, Dongarra @@ -114,6 +118,13 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, "par_ilut_symbolic: KernelHandle and Views have different execution " "spaces."); + if (A_rowmap.extent(0) != 0) { + KK_REQUIRE_MSG(A_rowmap.extent(0) == L_rowmap.extent(0), + "L row map size does not match A row map"); + KK_REQUIRE_MSG(A_rowmap.extent(0) == U_rowmap.extent(0), + "U row map size does not match A row map"); + } + using c_size_t = typename KernelHandle::const_size_type; using c_lno_t = typename KernelHandle::const_nnz_lno_t; using c_scalar_t = typename KernelHandle::const_nnz_scalar_t; @@ -173,8 +184,7 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, AValuesType& A_values, LRowMapType& L_rowmap, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_rowmap, - UEntriesType& U_entries, UValuesType& U_values, - bool deterministic) { + UEntriesType& U_entries, UValuesType& U_values) { using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; using scalar_type = typename KernelHandle::nnz_scalar_t; @@ -348,6 +358,11 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, KokkosKernels::Impl::throw_runtime_exception(os.str()); } + KK_REQUIRE_MSG(KokkosSparse::Impl::isCrsGraphSorted(L_rowmap, L_entries), + "L is not sorted"); + KK_REQUIRE_MSG(KokkosSparse::Impl::isCrsGraphSorted(U_rowmap, U_entries), + "U is not sorted"); + using c_size_t = typename KernelHandle::const_size_type; using c_lno_t = typename KernelHandle::const_nnz_lno_t; using c_scalar_t = typename KernelHandle::const_nnz_scalar_t; @@ -432,11 +447,16 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, KokkosSparse::Impl::PAR_ILUT_NUMERIC< const_handle_type, ARowMap_Internal, AEntries_Internal, AValues_Internal, LRowMap_Internal, LEntries_Internal, LValues_Internal, URowMap_Internal, - UEntries_Internal, - UValues_Internal>::par_ilut_numeric(&tmp_handle, A_rowmap_i, A_entries_i, - A_values_i, L_rowmap_i, L_entries_i, - L_values_i, U_rowmap_i, U_entries_i, - U_values_i, deterministic); + UEntries_Internal, UValues_Internal>::par_ilut_numeric(&tmp_handle, + A_rowmap_i, + A_entries_i, + A_values_i, + L_rowmap_i, + L_entries_i, + L_values_i, + U_rowmap_i, + U_entries_i, + U_values_i); // These may have been resized L_entries = L_entries_i; diff --git a/sparse/src/KokkosSparse_par_ilut_handle.hpp b/sparse/src/KokkosSparse_par_ilut_handle.hpp index 04f546cdd6..3ffe44ffca 100644 --- a/sparse/src/KokkosSparse_par_ilut_handle.hpp +++ b/sparse/src/KokkosSparse_par_ilut_handle.hpp @@ -24,6 +24,12 @@ namespace KokkosSparse { namespace Experimental { +/** + * Handle for par_ilut. Contains useful types, par_ilut configuration settings, + * symbolic settings and scalar output info. + * + * For more info, see KokkosSparse_par_ilut.hpp doxygen + */ template class PAR_ILUTHandle { @@ -66,41 +72,57 @@ class PAR_ILUTHandle { typename nnz_row_view_t::memory_traits>; private: - size_type nrows; - size_type nnzL; - size_type nnzU; - size_type max_iter; - nnz_scalar_t residual_norm_delta_stop; - - bool symbolic_complete; - - int team_size; - int vector_size; - - float_t fill_in_limit; + // User inputs + size_type max_iter; /// Hard cap on the number of par_ilut iterations + float_t residual_norm_delta_stop; /// When the change in residual from + /// iteration to iteration drops below + /// this, the algorithm will stop (even if + /// max_iters has not been hit) + float_t fill_in_limit; /// The threshold for the ILU factorization + bool async_update; /// Whether compute LU factors should do asychronous + /// updates. When ON, the algorithm will usually converge + /// faster but it makes the algorithm non-deterministic. + bool verbose; /// Print information while executing par_ilut + + // Stored by parent KokkosKernelsHandle + int team_size; /// Kokkos team size. Set by the parent handle. -1 implies + /// AUTO + int vector_size; /// Kokkos vector size. Set by the parent handle. + + // Stored by symbolic phase + size_type + nrows; /// Number of rows in the CSRs given to the symbolic par_ilut + size_type nnzL; /// Number of non-zero entries in the L part of A in the CSRs + /// given to the symbolic par_ilut + size_type nnzU; /// Number of non-zero entries in the U part of A in the CSRs + /// given to the symbolic par_ilut + bool symbolic_complete; /// Whether symbolic par_ilut has been called + + // Outputs + int num_iters; /// The number of iterations par_ilut took to finish + nnz_scalar_t end_rel_res; /// The A - LU residual norm at the time the + /// algorithm finished public: - PAR_ILUTHandle(const size_type nrows_, const size_type nnzL_ = 0, - const size_type nnzU_ = 0, const size_type max_iter_ = 1) - : nrows(nrows_), - nnzL(nnzL_), - nnzU(nnzU_), - max_iter(max_iter_), - residual_norm_delta_stop(0.), - symbolic_complete(false), + // See KokkosKernelsHandle::create_par_ilut_handle for default user input + // values + PAR_ILUTHandle(const size_type max_iter_, + const float_t residual_norm_delta_stop_, + const float_t fill_in_limit_, const bool async_update_, + const bool verbose_) + : max_iter(max_iter_), + residual_norm_delta_stop(residual_norm_delta_stop_), + fill_in_limit(fill_in_limit_), + async_update(async_update_), + verbose(verbose_), team_size(-1), vector_size(-1), - fill_in_limit(0.75) {} - - void reset_handle(const size_type nrows_, const size_type nnzL_, - const size_type nnzU_) { - set_nrows(nrows_); - set_nnzL(nnzL_); - set_nnzU(nnzU_); - set_residual_norm_delta_stop(0.); - reset_symbolic_complete(); - set_fill_in_limit(0.75); - } + nrows(0), + nnzL(0), + nnzU(0), + symbolic_complete(false), + num_iters(-1), + end_rel_res(-1) {} KOKKOS_INLINE_FUNCTION ~PAR_ILUTHandle() {} @@ -137,11 +159,10 @@ class PAR_ILUTHandle { void set_max_iter(const size_type max_iter_) { this->max_iter = max_iter_; } int get_max_iter() const { return this->max_iter; } - void set_residual_norm_delta_stop( - const nnz_scalar_t residual_norm_delta_stop_) { + void set_residual_norm_delta_stop(const float_t residual_norm_delta_stop_) { this->residual_norm_delta_stop = residual_norm_delta_stop_; } - nnz_scalar_t get_residual_norm_delta_stop() const { + float_t get_residual_norm_delta_stop() const { return this->residual_norm_delta_stop; } @@ -150,6 +171,16 @@ class PAR_ILUTHandle { } float_t get_fill_in_limit() const { return this->fill_in_limit; } + bool get_verbose() const { return verbose; } + + void set_verbose(const bool verbose_) { this->verbose = verbose_; } + + bool get_async_update() const { return async_update; } + + void set_async_update(const bool async_update_) { + this->async_update = async_update_; + } + TeamPolicy get_default_team_policy() const { if (team_size == -1) { return TeamPolicy(nrows, Kokkos::AUTO); @@ -157,6 +188,15 @@ class PAR_ILUTHandle { return TeamPolicy(nrows, team_size); } } + + int get_num_iters() const { return num_iters; } + + nnz_scalar_t get_end_rel_res() const { return end_rel_res; } + + void set_stats(int num_iters_, nnz_scalar_t end_rel_res_) { + num_iters = num_iters_; + end_rel_res = end_rel_res_; + } }; } // namespace Experimental diff --git a/sparse/src/KokkosSparse_spgemm_symbolic.hpp b/sparse/src/KokkosSparse_spgemm_symbolic.hpp index ff37199699..486d999e41 100644 --- a/sparse/src/KokkosSparse_spgemm_symbolic.hpp +++ b/sparse/src/KokkosSparse_spgemm_symbolic.hpp @@ -140,15 +140,26 @@ void spgemm_symbolic(KernelHandle *handle, // Verify that graphs A and B are sorted. // This test is designed to be as efficient as possible, but still skip // it in a release build. + // + // Temporary fix for Trilinos issue #11655: Only perform this check if a TPL + // is to be called. The KokkosKernels (non-TPL) implementation does not + // actually require sorted indices yet. And Tpetra uses size_type = size_t, so + // it will (currently) not be calling a TPL path. #ifndef NDEBUG - if (!KokkosSparse::Impl::isCrsGraphSorted(const_a_r, const_a_l)) - throw std::runtime_error( - "KokkosSparse::spgemm_symbolic: entries of A are not sorted within " - "rows. May use KokkosSparse::sort_crs_matrix to sort it."); - if (!KokkosSparse::Impl::isCrsGraphSorted(const_b_r, const_b_l)) - throw std::runtime_error( - "KokkosSparse::spgemm_symbolic: entries of B are not sorted within " - "rows. May use KokkosSparse::sort_crs_matrix to sort it."); + if constexpr (KokkosSparse::Impl::spgemm_symbolic_tpl_spec_avail< + const_handle_type, Internal_alno_row_view_t_, + Internal_alno_nnz_view_t_, Internal_blno_row_view_t_, + Internal_blno_nnz_view_t_, + Internal_clno_row_view_t_>::value) { + if (!KokkosSparse::Impl::isCrsGraphSorted(const_a_r, const_a_l)) + throw std::runtime_error( + "KokkosSparse::spgemm_symbolic: entries of A are not sorted within " + "rows. May use KokkosSparse::sort_crs_matrix to sort it."); + if (!KokkosSparse::Impl::isCrsGraphSorted(const_b_r, const_b_l)) + throw std::runtime_error( + "KokkosSparse::spgemm_symbolic: entries of B are not sorted within " + "rows. May use KokkosSparse::sort_crs_matrix to sort it."); + } #endif auto algo = tmp_handle.get_spgemm_handle()->get_algorithm_type(); diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index 049ab14b1e..7c9027d24a 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -138,7 +138,7 @@ class SPTRSVHandle { cusparseSpSVDescr_t spsvDescr; void *pBuffer{nullptr}; - cuSparseHandleType(bool transpose_, bool is_lower) { + cuSparseHandleType(bool transpose_, bool /*is_lower*/) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreate(&handle)); KOKKOS_CUSPARSE_SAFE_CALL( diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp index 50555d9815..5f555f926e 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp @@ -45,7 +45,7 @@ template void spgemm_numeric_cusparse( - KernelHandle *handle, lno_t m, lno_t n, lno_t k, + KernelHandle *handle, lno_t /*m*/, lno_t /*n*/, lno_t /*k*/, const ConstRowMapType &row_mapA, const ConstEntriesType &entriesA, const ConstValuesType &valuesA, const ConstRowMapType &row_mapB, const ConstEntriesType &entriesB, const ConstValuesType &valuesB, @@ -519,7 +519,6 @@ void spgemm_numeric_mkl( const_cast(colidxB.data()), const_cast(valuesB.data())); auto mklSpgemmHandle = handle->get_mkl_spgemm_handle(); - bool computedEntries = false; matrix_descr generalDescr; generalDescr.type = SPARSE_MATRIX_TYPE_GENERAL; generalDescr.mode = SPARSE_FILL_MODE_FULL; diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 28a80ec266..717c62b985 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -151,7 +151,8 @@ void spmv_mv_cusparse(const KokkosKernels::Experimental::Controls &controls, cusparseOperation_t opB = xIsLL ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; -#if CUDA_VERSION < 12000 +// CUSPARSE_MM_ALG_DEFAULT was deprecated as early as 11.1 (maybe earlier) +#if CUSPARSE_VERSION < 11010 const cusparseSpMMAlg_t alg = CUSPARSE_MM_ALG_DEFAULT; #else const cusparseSpMMAlg_t alg = CUSPARSE_SPMM_ALG_DEFAULT; diff --git a/sparse/unit_test/Test_Sparse_gmres.hpp b/sparse/unit_test/Test_Sparse_gmres.hpp index 6b1265dabd..1990087526 100644 --- a/sparse/unit_test/Test_Sparse_gmres.hpp +++ b/sparse/unit_test/Test_Sparse_gmres.hpp @@ -86,14 +86,13 @@ void run_test_gmres() { ViewVectorType Wj("Wj", n); // For checking residuals at end. ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), n); // right-hand side vec + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); gmres_handle->set_verbose(verbose); // Test CGS2 { - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B, 1.0); - gmres(&kh, A, B, X); // Double check residuals at end of solve: @@ -137,12 +136,12 @@ void run_test_gmres() { gmres_handle->set_verbose(verbose); // Make precond - auto myPrec = new KokkosSparse::Experimental::MatrixPrec(A); + KokkosSparse::Experimental::MatrixPrec myPrec(A); // reset X for next gmres call Kokkos::deep_copy(X, 0.0); - gmres(&kh, A, B, X, myPrec); + gmres(&kh, A, B, X, &myPrec); // Double check residuals at end of solve: float_t nrmB = KokkosBlas::nrm2(B); @@ -154,8 +153,6 @@ void run_test_gmres() { EXPECT_LT(endRes, gmres_handle->get_tol()); EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); - - delete myPrec; } } diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 0f73b80f3f..9b99c1000d 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -25,6 +25,9 @@ #include "KokkosBlas1_nrm2.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_par_ilut.hpp" +#include "KokkosSparse_gmres.hpp" +#include "KokkosSparse_LUPrec.hpp" +#include "KokkosSparse_SortCrs.hpp" #include @@ -35,6 +38,20 @@ using namespace KokkosKernels::Experimental; namespace Test { +namespace ParIlut { + +template +struct TolMeta { + static constexpr T value = 1e-8; +}; + +template <> +struct TolMeta { + static constexpr float value = 1e-5; // Lower tolerance for floats +}; + +} // namespace ParIlut + template std::vector> decompress_matrix( @@ -159,9 +176,10 @@ void run_test_par_ilut() { // Make kernel handle KernelHandle kh; - kh.create_par_ilut_handle(nrows); + kh.create_par_ilut_handle(); auto par_ilut_handle = kh.get_par_ilut_handle(); + par_ilut_handle->set_async_update(false); // Allocate L and U CRS views as outputs RowMapType L_row_map("L_row_map", nrows + 1); @@ -173,8 +191,8 @@ void run_test_par_ilut() { const size_type nnzL = par_ilut_handle->get_nnzL(); const size_type nnzU = par_ilut_handle->get_nnzU(); - EXPECT_EQ(nnzL, 10); - EXPECT_EQ(nnzU, 8); + ASSERT_EQ(nnzL, 10); + ASSERT_EQ(nnzU, 8); EntriesType L_entries("L_entries", nnzL); ValuesType L_values("L_values", nnzL); @@ -182,13 +200,7 @@ void run_test_par_ilut() { ValuesType U_values("U_values", nnzU); par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, - L_values, U_row_map, U_entries, U_values, -#ifdef KOKKOS_ENABLE_SERIAL - true /*deterministic*/ -#else - false /*cannot ask for determinism*/ -#endif - ); + L_values, U_row_map, U_entries, U_values); // Use this to check LU // std::vector > expected_LU = { @@ -242,10 +254,6 @@ void run_test_par_ilut() { // check_matrix("U numeric", U_row_map, U_entries, U_values, // expected_U_candidates); - // Serial is required for deterministic mode and the checks below cannot - // reliably pass without determinism. -#ifdef KOKKOS_ENABLE_SERIAL - // Use these fixtures to test full numeric std::vector> expected_L_candidates = { {1., 0., 0., 0.}, @@ -266,10 +274,199 @@ void run_test_par_ilut() { check_matrix("U numeric", U_row_map, U_entries, U_values, expected_U_candidates); - // Checking + kh.destroy_par_ilut_handle(); +} + +template +void run_test_par_ilut_precond() { + // Test using par_ilut as a preconditioner + // Does (LU)^inv Ax = (LU)^inv b converge faster than solving Ax=b? + using exe_space = typename device::execution_space; + using mem_space = typename device::memory_space; + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using sp_matrix_type = + KokkosSparse::CrsMatrix; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, exe_space, mem_space, mem_space>; + using float_t = typename Kokkos::ArithTraits::mag_type; + + // Create a diagonally dominant sparse matrix to test: + // par_ilut settings max_iters, res_delta_stop, fill_in_limit, and + // async_update are all left as defaults + constexpr auto n = 5000; + constexpr auto m = 15; + constexpr auto tol = ParIlut::TolMeta::value; + constexpr auto numRows = n; + constexpr auto numCols = n; + constexpr auto diagDominance = 1; + constexpr bool verbose = false; + + typename sp_matrix_type::non_const_size_type nnz = 10 * numRows; + auto A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< + sp_matrix_type>(numRows, numCols, nnz, 0, lno_t(0.01 * numRows), + diagDominance); + + KokkosSparse::sort_crs_matrix(A); + + // Make kernel handles + KernelHandle kh; + kh.create_gmres_handle(m, tol); + auto gmres_handle = kh.get_gmres_handle(); + gmres_handle->set_verbose(verbose); + using GMRESHandle = + typename std::remove_reference::type; + using ViewVectorType = typename GMRESHandle::nnz_value_view_t; + + kh.create_par_ilut_handle(); + auto par_ilut_handle = kh.get_par_ilut_handle(); + par_ilut_handle->set_verbose(verbose); + + // Pull out views from CRS + auto row_map = A.graph.row_map; + auto entries = A.graph.entries; + auto values = A.values; + + // Allocate L and U CRS views as outputs + RowMapType L_row_map("L_row_map", numRows + 1); + RowMapType U_row_map("U_row_map", numRows + 1); + + // Initial L/U approximations for A + par_ilut_symbolic(&kh, row_map, entries, L_row_map, U_row_map); + + const size_type nnzL = par_ilut_handle->get_nnzL(); + const size_type nnzU = par_ilut_handle->get_nnzU(); + + EntriesType L_entries("L_entries", nnzL); + ValuesType L_values("L_values", nnzL); + EntriesType U_entries("U_entries", nnzU); + ValuesType U_values("U_values", nnzU); + + par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values); + + // Create CRSs + sp_matrix_type L("L", numRows, numCols, L_values.extent(0), L_values, + L_row_map, L_entries), + U("U", numRows, numCols, U_values.extent(0), U_values, U_row_map, + U_entries); + + // Set initial vectors: + ViewVectorType X("X", n); // Solution and initial guess + ViewVectorType Wj("Wj", n); // For checking residuals at end. + ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + n); // right-hand side vec + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + + int num_iters_plain(0), num_iters_precond(0); + + // Solve Ax = b + { + gmres(&kh, A, B, X); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_plain = gmres_handle->get_num_iters(); + + EXPECT_GT(num_iters_plain, 0); + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + } + + // Solve Ax = b with LU preconditioner. + { + gmres_handle->reset_handle(m, tol); + gmres_handle->set_verbose(verbose); + + // Make precond + KokkosSparse::Experimental::LUPrec myPrec(L, + U); + + // reset X for next gmres call + Kokkos::deep_copy(X, 0.0); + + gmres(&kh, A, B, X, &myPrec); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_precond = gmres_handle->get_num_iters(); + + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + EXPECT_LT(num_iters_precond, num_iters_plain); + } +} + +template +void run_test_par_ilut_zerorow_A() { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, typename device::execution_space, + typename device::memory_space, typename device::memory_space>; + + const size_type nrows = 0; + + // Allocate device CRS views for A + RowMapType row_map("row_map", 0); + EntriesType entries("entries", 0); + ValuesType values("values", 0); + + // Create host mirror views for CRS A + auto hrow_map = Kokkos::create_mirror_view(row_map); + auto hentries = Kokkos::create_mirror_view(entries); + auto hvalues = Kokkos::create_mirror_view(values); + + // Make kernel handle + KernelHandle kh; + + kh.create_par_ilut_handle(); + + auto par_ilut_handle = kh.get_par_ilut_handle(); + + // Allocate L and U CRS views as outputs + RowMapType L_row_map("L_row_map", nrows + 1); + RowMapType U_row_map("U_row_map", nrows + 1); + + // Initial L/U approximations for A + par_ilut_symbolic(&kh, row_map, entries, L_row_map, U_row_map); + + const size_type nnzL = par_ilut_handle->get_nnzL(); + const size_type nnzU = par_ilut_handle->get_nnzU(); + + ASSERT_EQ(nnzL, 0); + ASSERT_EQ(nnzU, 0); + + EntriesType L_entries("L_entries", nnzL); + ValuesType L_values("L_values", nnzL); + EntriesType U_entries("U_entries", nnzU); + ValuesType U_values("U_values", nnzU); + + par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values); + + const auto itrs = par_ilut_handle->get_num_iters(); + const auto end_rel_res = par_ilut_handle->get_end_rel_res(); + + EXPECT_EQ(itrs, 0); + EXPECT_EQ(end_rel_res, scalar_t(0.)); kh.destroy_par_ilut_handle(); -#endif } } // namespace Test @@ -280,10 +477,32 @@ void test_par_ilut() { Test::run_test_par_ilut(); } -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - sparse##_##par_ilut##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_par_ilut(); \ +template +void test_par_ilut_precond() { + Test::run_test_par_ilut_precond(); +} + +template +void test_par_ilut_zerorow_A() { + Test::run_test_par_ilut_zerorow_A(); +} + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + sparse##_##par_ilut##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut(); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##par_ilut_zerorow_A##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut_zerorow_A(); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##par_ilut_precond##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut_precond(); \ } #define NO_TEST_COMPLEX diff --git a/sparse/unit_test/Test_Sparse_rocsparse.hpp b/sparse/unit_test/Test_Sparse_rocsparse.hpp index f4c9e4e741..804c777daa 100644 --- a/sparse/unit_test/Test_Sparse_rocsparse.hpp +++ b/sparse/unit_test/Test_Sparse_rocsparse.hpp @@ -66,6 +66,7 @@ void test_rocsparse_safe_call() { void test_rocsparse_singleton() { KokkosKernels::Impl::RocsparseSingleton& s = KokkosKernels::Impl::RocsparseSingleton::singleton(); + (void)s; } TEST_F(TestCategory, sparse_rocsparse_version) { test_rocsparse_version(); }