From 69e1eed85dcadb7094fa125f8c493db41b5381f0 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 24 May 2024 12:12:34 +0200 Subject: [PATCH 001/448] add Windows gitlab job --- .gitlab-ci.yml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f3cecee4b71..09ec11dab68 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -622,6 +622,41 @@ build/icpx/igpu/release/static: ONEAPI_DEVICE_SELECTOR: "*:gpu" BUILD_HWLOC: "OFF" +# windows jobs: Release shared +# Note that this is using Powershell, not bash +build/windows/release/shared: + stage: build + script: + - if (Test-Path build) { rm -r -fo build } + - if (Test-Path install) { rm -r -fo install } + - mkdir build + - mkdir install + - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=OFF "-DCMAKE_INSTALL_PREFIX=$pwd\install" . + - cmake --build build --config Release -j16 + - ctest --test-dir build -C Release --no-tests=error --output-on-failure -j16 + - $env:PATH+=";$pwd/install/bin" + - cmake --install build --config Release + - cmake --build build --target test_install --config Release + tags: + - windows + +# CUDA +build/windows-cuda/release/shared: + stage: build + script: + - if (Test-Path build) { rm -r -fo build } + - if (Test-Path install) { rm -r -fo install } + - mkdir build + - mkdir install + - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=ON "-DCMAKE_INSTALL_PREFIX=$pwd\install" . 
+ - cmake --build build --config Release -j16 + - ctest --test-dir build -C Release --no-tests=error --output-on-failure + - $env:PATH+=";$pwd/install/bin" + - cmake --install build --config Release + - cmake --build build --target test_install --config Release + tags: + - windows-cuda + # Job with important warnings as error warnings: stage: code_quality From 68048dad25690edde3e490f1917b1c9d6ca9dcd3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 26 Jun 2024 18:21:11 +0200 Subject: [PATCH 002/448] disable Github actions for Windows --- .github/workflows/windows-msvc-cuda.yml | 1 + .github/workflows/windows-msvc-ref.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/windows-msvc-cuda.yml b/.github/workflows/windows-msvc-cuda.yml index b1df1aaf4ed..efa637b2bf9 100644 --- a/.github/workflows/windows-msvc-cuda.yml +++ b/.github/workflows/windows-msvc-cuda.yml @@ -23,6 +23,7 @@ concurrency: jobs: windows_cuda: + if: ${{ false }} strategy: fail-fast: false matrix: diff --git a/.github/workflows/windows-msvc-ref.yml b/.github/workflows/windows-msvc-ref.yml index 117262b2016..60a811bb99b 100644 --- a/.github/workflows/windows-msvc-ref.yml +++ b/.github/workflows/windows-msvc-ref.yml @@ -23,6 +23,7 @@ concurrency: jobs: windows_ref: + if: ${{ false }} strategy: fail-fast: false matrix: From 84f34e630c3ad7f07ff220eb4e83542c47cdbdfb Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 26 Jun 2024 18:37:51 +0200 Subject: [PATCH 003/448] remove duplicate job runs --- .gitlab-ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 09ec11dab68..4a7860d263a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -625,6 +625,8 @@ build/icpx/igpu/release/static: # windows jobs: Release shared # Note that this is using Powershell, not bash build/windows/release/shared: + extends: + - .quick_test_condition stage: build script: - if (Test-Path build) { rm -r -fo build } @@ -642,6 +644,8 @@ 
build/windows/release/shared: # CUDA build/windows-cuda/release/shared: + extends: + - .quick_test_condition stage: build script: - if (Test-Path build) { rm -r -fo build } From 1a2ee540e707ebd9a2c6e8678f4f29fe7d4b0080 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 26 Jun 2024 18:53:52 +0200 Subject: [PATCH 004/448] work around intel timer issues --- test/base/timer.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/base/timer.cpp b/test/base/timer.cpp index 551e8b4b955..a817ddeef96 100644 --- a/test/base/timer.cpp +++ b/test/base/timer.cpp @@ -41,9 +41,15 @@ TEST_F(Timer, WorksAsync) auto timer = gko::Timer::create_for_executor(this->exec); auto start = timer->create_time_point(); auto stop = timer->create_time_point(); + gko::array dummy{this->exec, {0}}; + auto dummy2 = dummy; + this->exec->synchronize(); + // we do some minimal work to work around Intel GPU timers running backwards timer->record(start); + dummy = dummy2; std::this_thread::sleep_for(std::chrono::seconds{5}); + dummy = dummy2; timer->record(stop); timer->wait(stop); @@ -56,9 +62,15 @@ TEST_F(Timer, Works) auto timer = gko::Timer::create_for_executor(this->exec); auto start = timer->create_time_point(); auto stop = timer->create_time_point(); + gko::array dummy{this->exec, {0}}; + auto dummy2 = dummy; + this->exec->synchronize(); + // we do some minimal work to work around Intel GPU timers running backwards timer->record(start); + dummy = dummy2; std::this_thread::sleep_for(std::chrono::seconds{5}); + dummy = dummy2; timer->record(stop); ASSERT_GT(timer->difference(start, stop), std::chrono::seconds{1}); From afaaf9b3d41109e44ac928e64a72849e26ea141b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 27 Jun 2024 16:10:46 +0200 Subject: [PATCH 005/448] disable tests, run CUDA first --- .gitlab-ci.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4a7860d263a..1866f16406a 100644 --- 
a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -622,9 +622,9 @@ build/icpx/igpu/release/static: ONEAPI_DEVICE_SELECTOR: "*:gpu" BUILD_HWLOC: "OFF" -# windows jobs: Release shared +# windows jobs # Note that this is using Powershell, not bash -build/windows/release/shared: +build/windows-cuda/release/shared: extends: - .quick_test_condition stage: build @@ -633,17 +633,17 @@ build/windows/release/shared: - if (Test-Path install) { rm -r -fo install } - mkdir build - mkdir install - - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=OFF "-DCMAKE_INSTALL_PREFIX=$pwd\install" . + - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=ON "-DCMAKE_INSTALL_PREFIX=$pwd\install" . - cmake --build build --config Release -j16 - - ctest --test-dir build -C Release --no-tests=error --output-on-failure -j16 +# we disable these tests until the triangular solver issues are resolved +# - ctest --test-dir build -C Release --no-tests=error --output-on-failure - $env:PATH+=";$pwd/install/bin" - cmake --install build --config Release - cmake --build build --target test_install --config Release tags: - - windows + - windows-cuda -# CUDA -build/windows-cuda/release/shared: +build/windows/release/shared: extends: - .quick_test_condition stage: build @@ -652,14 +652,14 @@ build/windows-cuda/release/shared: - if (Test-Path install) { rm -r -fo install } - mkdir build - mkdir install - - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=ON "-DCMAKE_INSTALL_PREFIX=$pwd\install" . + - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=OFF "-DCMAKE_INSTALL_PREFIX=$pwd\install" . 
- cmake --build build --config Release -j16 - - ctest --test-dir build -C Release --no-tests=error --output-on-failure + - ctest --test-dir build -C Release --no-tests=error --output-on-failure -j16 - $env:PATH+=";$pwd/install/bin" - cmake --install build --config Release - cmake --build build --target test_install --config Release tags: - - windows-cuda + - windows # Job with important warnings as error warnings: From 412756ae5e6ebd76aa7dd803c6a5aaa01e280c9e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 21 May 2024 17:01:37 +0200 Subject: [PATCH 006/448] replace EXEC_NAMESPACE by GKO_DEVICE_NAMESPACE --- test/base/batch_multi_vector_kernels.cpp | 8 ++-- test/base/executor.cpp | 4 +- test/base/index_range.cpp | 2 +- test/base/kernel_launch_generic.cpp | 36 ++++++++--------- test/components/absolute_array_kernels.cpp | 8 ++-- test/components/fill_array_kernels.cpp | 4 +- test/components/format_conversion_kernels.cpp | 10 ++--- test/components/prefix_sum_kernels.cpp | 8 ++-- test/components/reduce_array_kernels.cpp | 2 +- test/distributed/index_map_kernels.cpp | 16 ++++---- test/distributed/matrix_kernels.cpp | 2 +- test/distributed/partition_helper_kernels.cpp | 22 +++++----- test/distributed/vector_kernels.cpp | 2 +- test/factorization/cholesky_kernels.cpp | 16 ++++---- test/factorization/lu_kernels.cpp | 8 ++-- test/factorization/par_ic_kernels.cpp | 4 +- test/factorization/par_ict_kernels.cpp | 8 ++-- test/factorization/par_ilu_kernels.cpp | 14 +++---- test/factorization/par_ilut_kernels.cpp | 23 ++++++----- test/matrix/csr_kernels.cpp | 6 +-- test/matrix/csr_kernels2.cpp | 15 +++---- test/matrix/dense_kernels.cpp | 10 ++--- test/matrix/ell_kernels.cpp | 2 +- test/matrix/sparsity_csr_kernels.cpp | 6 +-- test/multigrid/pgm_kernels.cpp | 21 +++++----- test/preconditioner/batch_jacobi_kernels.cpp | 2 +- test/preconditioner/isai_kernels.cpp | 40 +++++++++---------- test/solver/batch_bicgstab_kernels.cpp | 2 +- test/solver/batch_cg_kernels.cpp | 2 +- 
test/solver/bicg_kernels.cpp | 6 +-- test/solver/bicgstab_kernels.cpp | 8 ++-- test/solver/cb_gmres_kernels.cpp | 8 ++-- test/solver/cg_kernels.cpp | 14 +++---- test/solver/cgs_kernels.cpp | 8 ++-- test/solver/fcg_kernels.cpp | 10 ++--- test/solver/gcr_kernels.cpp | 6 +-- test/solver/gmres_kernels.cpp | 12 +++--- test/solver/idr_kernels.cpp | 14 +++---- test/solver/ir_kernels.cpp | 2 +- test/solver/multigrid_kernels.cpp | 8 ++-- 40 files changed, 201 insertions(+), 198 deletions(-) diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index ab15e1a99a3..07749d9bed2 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -312,8 +312,8 @@ TEST_F(MultiVector, CopySingleIsEquivalentToRef) gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(), y.get()); - gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(), - dy.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_multi_vector::copy( + this->exec, dx.get(), dy.get()); GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0); } @@ -325,8 +325,8 @@ TEST_F(MultiVector, CopyIsEquivalentToRef) gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(), y.get()); - gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(), - dy.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_multi_vector::copy( + this->exec, dx.get(), dy.get()); GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0); } diff --git a/test/base/executor.cpp b/test/base/executor.cpp index 8ea3b01fb24..541360d01d4 100644 --- a/test/base/executor.cpp +++ b/test/base/executor.cpp @@ -72,7 +72,7 @@ TEST_F(Executor, RunsCorrectOperation) exec->run(ExampleOperation(value)); - ASSERT_EQ(EXEC_NAMESPACE::value, value); + ASSERT_EQ(GKO_DEVICE_NAMESPACE::value, value); } @@ -104,7 +104,7 @@ TEST_F(Executor, RunsCorrectLambdaOperation) exec->run(omp_lambda, cuda_lambda, hip_lambda, dpcpp_lambda); - ASSERT_EQ(EXEC_NAMESPACE::value, value); + 
ASSERT_EQ(GKO_DEVICE_NAMESPACE::value, value); } diff --git a/test/base/index_range.cpp b/test/base/index_range.cpp index 044202fd8e2..b16b5fb9046 100644 --- a/test/base/index_range.cpp +++ b/test/base/index_range.cpp @@ -30,7 +30,7 @@ class IndexRange : public CommonTestFixture { void run_range_for(std::shared_ptr exec, gko::array& result_array) { - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto result, auto size) { for (auto i : gko::irange{size}) { diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index 55e1268a77a..c746a5b3461 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -46,7 +46,7 @@ move_only_type move_only_val{}; namespace gko { namespace kernels { -namespace EXEC_NAMESPACE { +namespace GKO_DEVICE_NAMESPACE { template <> @@ -57,7 +57,7 @@ struct to_device_type_impl { }; -} // namespace EXEC_NAMESPACE +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko @@ -108,7 +108,7 @@ class KernelLaunch : public CommonTestFixture { // nvcc doesn't like device lambdas declared in complex classes, move it out void run1d(std::shared_ptr exec, size_type dim, int* data) { - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto dummy) { static_assert(is_same::value, "index"); @@ -129,7 +129,7 @@ TEST_F(KernelLaunch, Runs1D) void run1d(std::shared_ptr exec, gko::array& data) { - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); @@ -155,7 +155,7 @@ TEST_F(KernelLaunch, Runs1DArray) void run1d(std::shared_ptr exec, Mtx* m) { - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr, 
auto dummy) { static_assert(is_same::value, "index"); @@ -193,7 +193,7 @@ TEST_F(KernelLaunch, Runs1DDense) void run2d(std::shared_ptr exec, int* data) { - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto dummy) { static_assert(is_same::value, "index"); @@ -215,7 +215,7 @@ TEST_F(KernelLaunch, Runs2D) void run2d(std::shared_ptr exec, gko::array& data) { - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); @@ -242,7 +242,7 @@ TEST_F(KernelLaunch, Runs2DArray) void run2d(std::shared_ptr exec, Mtx* m1, Mtx* m2, Mtx* m3) { - gko::kernels::EXEC_NAMESPACE::run_kernel_solver( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_solver( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, auto d4, auto d2_ptr, auto d3_ptr, auto dummy) { @@ -280,8 +280,8 @@ void run2d(std::shared_ptr exec, Mtx* m1, Mtx* m2, Mtx* m3) }, dim<2>{4, 4}, m2->get_stride(), m1, static_cast(m1), m1->get_const_values(), - gko::kernels::EXEC_NAMESPACE::default_stride(m2), - gko::kernels::EXEC_NAMESPACE::row_vector(m3), m2->get_values(), + gko::kernels::GKO_DEVICE_NAMESPACE::default_stride(m2), + gko::kernels::GKO_DEVICE_NAMESPACE::row_vector(m3), m2->get_values(), m3->get_values(), move_only_val); } @@ -297,7 +297,7 @@ void run1d_reduction(std::shared_ptr exec) { gko::array output{exec, {-1l}}; auto run_reduction = [&](int64 init, size_type size) { - gko::kernels::EXEC_NAMESPACE::run_kernel_reduction( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction( exec, [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); @@ -343,7 +343,7 @@ void run1d_reduction_cached(std::shared_ptr exec, gko::array temp(exec); for (const auto& size : sizes) { temp.clear(); - gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( 
+ gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction_cached( exec, [] GKO_KERNEL(auto i) { return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), @@ -366,7 +366,7 @@ void run2d_reduction(std::shared_ptr exec) { gko::array output{exec, {-1l}}; auto run_reduction = [&](int64 init, gko::dim<2> size) { - gko::kernels::EXEC_NAMESPACE::run_kernel_reduction( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction( exec, [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); @@ -435,7 +435,7 @@ void run2d_reduction_cached(std::shared_ptr exec, gko::array temp(exec); for (const auto& dim : dims) { temp.clear(); - gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction_cached( exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), @@ -482,7 +482,7 @@ void run2d_row_reduction(std::shared_ptr exec) static_cast(num_cols) * (num_cols + 1) * (i + 1); } - gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_row_reduction( exec, [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); @@ -527,7 +527,7 @@ void run2d_row_reduction_cached(std::shared_ptr exec, host_ref.get_data()[i] = dim[1] + i + 1; } - gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction_cached( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_row_reduction_cached( exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), @@ -576,7 +576,7 @@ void run2d_col_reduction(std::shared_ptr exec) static_cast(num_rows) * (num_rows + 1) * (i + 1); } - 
gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_col_reduction( exec, [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); @@ -620,7 +620,7 @@ void run2d_col_reduction_cached(std::shared_ptr exec, host_ref.get_data()[i] = dim[0] + i + 1; } - gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction_cached( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_col_reduction_cached( exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), diff --git a/test/components/absolute_array_kernels.cpp b/test/components/absolute_array_kernels.cpp index 6e00ad6e185..08dd52f35e3 100644 --- a/test/components/absolute_array_kernels.cpp +++ b/test/components/absolute_array_kernels.cpp @@ -46,7 +46,7 @@ class AbsoluteArray : public CommonTestFixture { TEST_F(AbsoluteArray, InplaceEqualsReference) { - gko::kernels::EXEC_NAMESPACE::components::inplace_absolute_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::inplace_absolute_array( exec, dvals.get_data(), total_size); gko::kernels::reference::components::inplace_absolute_array( ref, vals.get_data(), total_size); @@ -57,7 +57,7 @@ TEST_F(AbsoluteArray, InplaceEqualsReference) TEST_F(AbsoluteArray, InplaceComplexEqualsReference) { - gko::kernels::EXEC_NAMESPACE::components::inplace_absolute_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::inplace_absolute_array( exec, dcomplex_vals.get_data(), total_size); gko::kernels::reference::components::inplace_absolute_array( ref, complex_vals.get_data(), total_size); @@ -71,7 +71,7 @@ TEST_F(AbsoluteArray, OutplaceEqualsReference) gko::array abs_vals(ref, total_size); gko::array dabs_vals(exec, total_size); - gko::kernels::EXEC_NAMESPACE::components::outplace_absolute_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::outplace_absolute_array( exec, 
dvals.get_const_data(), total_size, dabs_vals.get_data()); gko::kernels::reference::components::outplace_absolute_array( ref, vals.get_const_data(), total_size, abs_vals.get_data()); @@ -85,7 +85,7 @@ TEST_F(AbsoluteArray, OutplaceComplexEqualsReference) gko::array abs_vals(ref, total_size); gko::array dabs_vals(exec, total_size); - gko::kernels::EXEC_NAMESPACE::components::outplace_absolute_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::outplace_absolute_array( exec, dcomplex_vals.get_const_data(), total_size, dabs_vals.get_data()); gko::kernels::reference::components::outplace_absolute_array( ref, complex_vals.get_const_data(), total_size, abs_vals.get_data()); diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp index 9ccf63e5c88..3997c5830ea 100644 --- a/test/components/fill_array_kernels.cpp +++ b/test/components/fill_array_kernels.cpp @@ -47,7 +47,7 @@ TYPED_TEST_SUITE(FillArray, gko::test::ValueAndIndexTypes, TYPED_TEST(FillArray, EqualsReference) { using T = typename TestFixture::value_type; - gko::kernels::EXEC_NAMESPACE::components::fill_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_array( this->exec, this->dvals.get_data(), this->total_size, T(1523)); GKO_ASSERT_ARRAY_EQ(this->vals, this->dvals); @@ -57,7 +57,7 @@ TYPED_TEST(FillArray, EqualsReference) TYPED_TEST(FillArray, FillSeqEqualsReference) { using T = typename TestFixture::value_type; - gko::kernels::EXEC_NAMESPACE::components::fill_seq_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_seq_array( this->exec, this->dvals.get_data(), this->total_size); GKO_ASSERT_ARRAY_EQ(this->seqs, this->dvals); diff --git a/test/components/format_conversion_kernels.cpp b/test/components/format_conversion_kernels.cpp index fee77ea5986..053171ffbe2 100644 --- a/test/components/format_conversion_kernels.cpp +++ b/test/components/format_conversion_kernels.cpp @@ -63,7 +63,7 @@ TYPED_TEST(FormatConversion, ConvertsEmptyPtrsToIdxs) 
ptrs.fill(0); TypeParam* output = nullptr; - gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_idxs( + gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_ptrs_to_idxs( this->exec, ptrs.get_const_data(), this->size, output); // mustn't segfault @@ -75,7 +75,7 @@ TYPED_TEST(FormatConversion, ConvertPtrsToIdxs) auto ref_idxs = this->idxs; this->idxs.fill(-1); - gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_idxs( + gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_ptrs_to_idxs( this->exec, this->ptrs.get_const_data(), this->size, this->idxs.get_data()); @@ -90,7 +90,7 @@ TYPED_TEST(FormatConversion, ConvertsEmptyIdxsToPtrs) this->ptrs.fill(-1); TypeParam* input = nullptr; - gko::kernels::EXEC_NAMESPACE::components::convert_idxs_to_ptrs( + gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_idxs_to_ptrs( this->exec, input, 0, this->size, this->ptrs.get_data()); GKO_ASSERT_ARRAY_EQ(this->ptrs, ref_ptrs); @@ -102,7 +102,7 @@ TYPED_TEST(FormatConversion, ConvertIdxsToPtrsIsEquivalentToRef) auto ref_ptrs = this->ptrs; this->ptrs.fill(-1); - gko::kernels::EXEC_NAMESPACE::components::convert_idxs_to_ptrs( + gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_idxs_to_ptrs( this->exec, this->idxs.get_const_data(), this->idxs.get_size(), this->size, this->ptrs.get_data()); @@ -115,7 +115,7 @@ TYPED_TEST(FormatConversion, ConvertPtrsToSizesIsEquivalentToRef) auto ref_sizes = this->sizes; this->sizes.fill(12345); - gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_sizes( + gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_ptrs_to_sizes( this->exec, this->ptrs.get_const_data(), this->size, this->sizes.get_data()); diff --git a/test/components/prefix_sum_kernels.cpp b/test/components/prefix_sum_kernels.cpp index cf1777bb6ae..73cb0c7874e 100644 --- a/test/components/prefix_sum_kernels.cpp +++ b/test/components/prefix_sum_kernels.cpp @@ -57,7 +57,7 @@ TYPED_TEST(PrefixSum, EqualsReference) SCOPED_TRACE(size); 
gko::kernels::reference::components::prefix_sum_nonnegative( this->ref, this->vals.get_data(), size); - gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative( + gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative( this->exec, this->dvals.get_data(), size); GKO_ASSERT_ARRAY_EQ(this->vals, this->dvals); @@ -74,7 +74,7 @@ TYPED_TEST(PrefixSum, WorksCloseToOverflow) std::is_unsigned::value; gko::array data{this->exec, I({max - 1, 1, 0})}; - gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative( + gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative( this->exec, data.get_data(), data.get_size()); GKO_ASSERT_ARRAY_EQ(data, I({0, max - 1, max})); @@ -86,7 +86,7 @@ TYPED_TEST(PrefixSum, DoesntOverflowFromLastElement) const auto max = std::numeric_limits::max(); gko::array data{this->exec, I({2, max - 1})}; - gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative( + gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative( this->exec, data.get_data(), data.get_size()); GKO_ASSERT_ARRAY_EQ(data, I({0, 2})); @@ -103,7 +103,7 @@ TYPED_TEST(PrefixSum, ThrowsOnOverflow) {max / 3, max / 2, max / 4, max / 3, max / 4}}; ASSERT_THROW( - gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative( + gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative( this->exec, data.get_data(), data.get_size()), gko::OverflowError); } diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp index cd6c2a8d7bf..dfc2e046c84 100644 --- a/test/components/reduce_array_kernels.cpp +++ b/test/components/reduce_array_kernels.cpp @@ -50,7 +50,7 @@ TYPED_TEST(ReduceArray, EqualsReference) { gko::kernels::reference::components::reduce_add_array(this->ref, this->vals, this->out); - gko::kernels::EXEC_NAMESPACE::components::reduce_add_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::reduce_add_array( this->exec, this->dvals, this->dout); 
GKO_ASSERT_ARRAY_EQ(this->out, this->dout); diff --git a/test/distributed/index_map_kernels.cpp b/test/distributed/index_map_kernels.cpp index 458ca594a56..cafd7b4da35 100644 --- a/test/distributed/index_map_kernels.cpp +++ b/test/distributed/index_map_kernels.cpp @@ -97,7 +97,7 @@ TEST_F(IndexMapBuildMapping, BuildMappingSameAsRef) gko::kernels::reference::index_map::build_mapping( ref, part.get(), query, target_ids, remote_local_idxs, remote_global_idxs, remote_sizes); - gko::kernels::EXEC_NAMESPACE::index_map::build_mapping( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::build_mapping( exec, dpart.get(), dquery, dtarget_ids, dremote_local_idxs, dremote_global_idxs, dremote_sizes); @@ -136,7 +136,7 @@ class IndexMap : public CommonTestFixture { gko::kernels::reference::index_map::build_mapping( ref, part.get(), connections, target_ids, flat_remote_local_idxs, flat_remote_global_idxs, remote_sizes); - gko::kernels::EXEC_NAMESPACE::index_map::build_mapping( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::build_mapping( exec, dpart.get(), dconnections, dtarget_ids, dflat_remote_local_idxs, dflat_remote_global_idxs, dremote_sizes); @@ -247,7 +247,7 @@ TEST_F(IndexMap, GetLocalWithLocalIndexSpaceSameAsRef) ref, part.get(), target_ids, to_device_const(remote_global_idxs), this_rank, query, gko::experimental::distributed::index_space::local, result); - gko::kernels::EXEC_NAMESPACE::index_map::map_to_local( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local( exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs), this_rank, dquery, gko::experimental::distributed::index_space::local, dresult); @@ -275,7 +275,7 @@ TEST_F(IndexMap, GetLocalWithLocalIndexSpaceWithInvalidIndexSameAsRef) ref, part.get(), target_ids, to_device_const(remote_global_idxs), this_rank, query, gko::experimental::distributed::index_space::local, result); - gko::kernels::EXEC_NAMESPACE::index_map::map_to_local( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local( 
exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs), this_rank, dquery, gko::experimental::distributed::index_space::local, dresult); @@ -304,7 +304,7 @@ TEST_F(IndexMap, GetLocalWithNonLocalIndexSpaceSameAsRef) ref, part.get(), target_ids, to_device_const(remote_global_idxs), this_rank, query, gko::experimental::distributed::index_space::non_local, result); - gko::kernels::EXEC_NAMESPACE::index_map::map_to_local( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local( exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs), this_rank, dquery, gko::experimental::distributed::index_space::non_local, dresult); @@ -330,7 +330,7 @@ TEST_F(IndexMap, GetLocalWithNonLocalIndexSpaceWithInvalidIndexSameAsRef) ref, part.get(), target_ids, to_device_const(remote_global_idxs), this_rank, query, gko::experimental::distributed::index_space::non_local, result); - gko::kernels::EXEC_NAMESPACE::index_map::map_to_local( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local( exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs), this_rank, dquery, gko::experimental::distributed::index_space::non_local, dresult); @@ -355,7 +355,7 @@ TEST_F(IndexMap, GetLocalWithCombinedIndexSpaceSameAsRef) ref, part.get(), target_ids, to_device_const(remote_global_idxs), this_rank, query, gko::experimental::distributed::index_space::combined, result); - gko::kernels::EXEC_NAMESPACE::index_map::map_to_local( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local( exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs), this_rank, dquery, gko::experimental::distributed::index_space::combined, dresult); @@ -385,7 +385,7 @@ TEST_F(IndexMap, GetLocalWithCombinedIndexSpaceWithInvalidIndexSameAsRef) ref, part.get(), target_ids, to_device_const(remote_global_idxs), this_rank, query, gko::experimental::distributed::index_space::non_local, result); - gko::kernels::EXEC_NAMESPACE::index_map::map_to_local( + 
gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local( exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs), this_rank, dquery, gko::experimental::distributed::index_space::non_local, dresult); diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp index 5e3677db2f4..8445aee6a0e 100644 --- a/test/distributed/matrix_kernels.cpp +++ b/test/distributed/matrix_kernels.cpp @@ -72,7 +72,7 @@ class Matrix : public CommonTestFixture { ref, input, row_partition.get(), col_partition.get(), part, local_row_idxs, local_col_idxs, local_values, non_local_row_idxs, non_local_col_idxs, non_local_values); - gko::kernels::EXEC_NAMESPACE::distributed_matrix:: + gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix:: separate_local_nonlocal( exec, d_input, d_row_partition.get(), d_col_partition.get(), part, d_local_row_idxs, d_local_col_idxs, d_local_values, diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index 8121a720908..9e985ffec9e 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -147,8 +147,8 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges) auto offsets = make_array(this->exec, create_ranges(100)); bool result = false; - gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, offsets, result); + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers:: + check_consecutive_ranges(this->exec, offsets, result); ASSERT_TRUE(result); } @@ -163,8 +163,8 @@ TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) make_array(this->exec, remove_indices(full_range_ends, removal_idxs)); bool result = true; - gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, start_ends, result); + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers:: + check_consecutive_ranges(this->exec, start_ends, result); ASSERT_FALSE(result); } @@ -176,8 +176,8 
@@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange) auto start_ends = make_array(this->ref, create_ranges(1)); bool result = false; - gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, start_ends, result); + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers:: + check_consecutive_ranges(this->exec, start_ends, result); ASSERT_TRUE(result); } @@ -189,8 +189,8 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement) auto start_ends = gko::array(this->exec, {1}); bool result = false; - gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, start_ends, result); + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers:: + check_consecutive_ranges(this->exec, start_ends, result); ASSERT_TRUE(result); } @@ -206,7 +206,7 @@ TYPED_TEST(PartitionHelpers, CanSortConsecutiveRanges) auto expected_start_ends = start_ends; auto expected_part_ids = part_ids_arr; - gko::kernels::EXEC_NAMESPACE::partition_helpers::sort_by_range_start( + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::sort_by_range_start( this->exec, start_ends, part_ids_arr); GKO_ASSERT_ARRAY_EQ(expected_start_ends, start_ends); @@ -227,7 +227,7 @@ TYPED_TEST(PartitionHelpers, CanSortNonConsecutiveRanges) auto part_ids_arr = gko::array( this->exec, shuffled.second.begin(), shuffled.second.end()); - gko::kernels::EXEC_NAMESPACE::partition_helpers::sort_by_range_start( + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::sort_by_range_start( this->exec, start_ends, part_ids_arr); GKO_ASSERT_ARRAY_EQ(expected_start_ends, start_ends); @@ -242,7 +242,7 @@ TYPED_TEST(PartitionHelpers, CanCompressRanges) auto ranges = make_array(this->exec, create_ranges(expected_offsets)); gko::array offsets{this->exec, expected_offsets.size()}; - gko::kernels::EXEC_NAMESPACE::partition_helpers::compress_ranges( + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::compress_ranges( this->exec, ranges, offsets); 
GKO_ASSERT_ARRAY_EQ(offsets, make_array(this->exec, expected_offsets)); diff --git a/test/distributed/vector_kernels.cpp b/test/distributed/vector_kernels.cpp index e8e3d6a7e7b..86faca6b2b2 100644 --- a/test/distributed/vector_kernels.cpp +++ b/test/distributed/vector_kernels.cpp @@ -61,7 +61,7 @@ class Vector : public CommonTestFixture { gko::kernels::reference::distributed_vector::build_local( ref, input, partition.get(), part, output.get()); - gko::kernels::EXEC_NAMESPACE::distributed_vector::build_local( + gko::kernels::GKO_DEVICE_NAMESPACE::distributed_vector::build_local( exec, d_input, d_partition.get(), part, d_output.get()); GKO_ASSERT_MTX_NEAR(output, d_output, 0); diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index 82c59477fd8..c1d0a6c7336 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -150,7 +150,7 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicCount) gko::kernels::reference::cholesky::symbolic_count( this->ref, mtx.get(), *forest, row_nnz.get_data(), this->tmp); - gko::kernels::EXEC_NAMESPACE::cholesky::symbolic_count( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_count( this->exec, dmtx.get(), *dforest, drow_nnz.get_data(), this->dtmp); GKO_ASSERT_ARRAY_EQ(drow_nnz, row_nnz); @@ -189,12 +189,12 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicFactorize) std::unique_ptr dforest; gko::factorization::compute_elim_forest(dmtx.get(), dforest); gko::array dtmp_ptrs{this->exec, num_rows + 1}; - gko::kernels::EXEC_NAMESPACE::cholesky::symbolic_count( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_count( this->exec, dmtx.get(), *dforest, dtmp_ptrs.get_data(), this->dtmp); gko::kernels::reference::cholesky::symbolic_factorize( this->ref, mtx.get(), *forest, l_factor.get(), this->tmp); - gko::kernels::EXEC_NAMESPACE::cholesky::symbolic_factorize( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_factorize( this->exec, dmtx.get(), 
*dforest, dl_factor.get(), this->dtmp); GKO_ASSERT_MTX_EQ_SPARSITY(dl_factor, l_factor); @@ -239,7 +239,7 @@ TYPED_TEST(CholeskySymbolic, KernelForestFromFactorWorks) elimination_forest dforest{this->exec, static_cast(mtx->get_size()[0])}; - gko::kernels::EXEC_NAMESPACE::cholesky::forest_from_factor( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::forest_from_factor( this->exec, dfactors.get(), dforest); this->assert_equal_forests(*forest, dforest); @@ -367,7 +367,7 @@ TYPED_TEST(Cholesky, KernelInitializeIsEquivalentToRef) this->forall_matrices([this] { const auto nnz = this->mtx_chol->get_num_stored_elements(); std::fill_n(this->mtx_chol->get_values(), nnz, gko::zero()); - gko::kernels::EXEC_NAMESPACE::components::fill_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_array( this->exec, this->dmtx_chol->get_values(), nnz, gko::zero()); gko::array diag_idxs{this->ref, this->num_rows}; @@ -380,7 +380,7 @@ TYPED_TEST(Cholesky, KernelInitializeIsEquivalentToRef) this->row_descs.get_const_data(), this->storage.get_const_data(), diag_idxs.get_data(), transpose_idxs.get_data(), this->mtx_chol.get()); - gko::kernels::EXEC_NAMESPACE::cholesky::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::initialize( this->exec, this->dmtx.get(), this->dstorage_offsets.get_const_data(), this->drow_descs.get_const_data(), this->dstorage.get_const_data(), @@ -410,7 +410,7 @@ TYPED_TEST(Cholesky, KernelFactorizeIsEquivalentToRef) this->row_descs.get_const_data(), this->storage.get_const_data(), diag_idxs.get_data(), transpose_idxs.get_data(), this->mtx_chol.get()); - gko::kernels::EXEC_NAMESPACE::cholesky::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::initialize( this->exec, this->dmtx.get(), this->dstorage_offsets.get_const_data(), this->drow_descs.get_const_data(), this->dstorage.get_const_data(), @@ -422,7 +422,7 @@ TYPED_TEST(Cholesky, KernelFactorizeIsEquivalentToRef) this->row_descs.get_const_data(), this->storage.get_const_data(), 
diag_idxs.get_const_data(), transpose_idxs.get_const_data(), *this->forest, this->mtx_chol.get(), tmp); - gko::kernels::EXEC_NAMESPACE::cholesky::factorize( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::factorize( this->exec, this->dstorage_offsets.get_const_data(), this->drow_descs.get_const_data(), this->dstorage.get_const_data(), ddiag_idxs.get_const_data(), dtranspose_idxs.get_const_data(), diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp index fdcaa0cfad0..0ea06bed506 100644 --- a/test/factorization/lu_kernels.cpp +++ b/test/factorization/lu_kernels.cpp @@ -156,7 +156,7 @@ TYPED_TEST(Lu, KernelInitializeIsEquivalentToRef) std::fill_n(this->mtx_lu->get_values(), this->mtx_lu->get_num_stored_elements(), gko::zero()); - gko::kernels::EXEC_NAMESPACE::components::fill_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_array( this->exec, this->dmtx_lu->get_values(), this->dmtx_lu->get_num_stored_elements(), gko::zero()); gko::array diag_idxs{this->ref, this->num_rows}; @@ -166,7 +166,7 @@ TYPED_TEST(Lu, KernelInitializeIsEquivalentToRef) this->ref, this->mtx.get(), this->storage_offsets.get_const_data(), this->row_descs.get_const_data(), this->storage.get_const_data(), diag_idxs.get_data(), this->mtx_lu.get()); - gko::kernels::EXEC_NAMESPACE::lu_factorization::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::initialize( this->exec, this->dmtx.get(), this->dstorage_offsets.get_const_data(), this->drow_descs.get_const_data(), this->dstorage.get_const_data(), @@ -191,7 +191,7 @@ TYPED_TEST(Lu, KernelFactorizeIsEquivalentToRef) this->ref, this->mtx.get(), this->storage_offsets.get_const_data(), this->row_descs.get_const_data(), this->storage.get_const_data(), diag_idxs.get_data(), this->mtx_lu.get()); - gko::kernels::EXEC_NAMESPACE::lu_factorization::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::initialize( this->exec, this->dmtx.get(), this->dstorage_offsets.get_const_data(), 
this->drow_descs.get_const_data(), this->dstorage.get_const_data(), @@ -201,7 +201,7 @@ TYPED_TEST(Lu, KernelFactorizeIsEquivalentToRef) this->ref, this->storage_offsets.get_const_data(), this->row_descs.get_const_data(), this->storage.get_const_data(), diag_idxs.get_const_data(), this->mtx_lu.get(), tmp); - gko::kernels::EXEC_NAMESPACE::lu_factorization::factorize( + gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::factorize( this->exec, this->dstorage_offsets.get_const_data(), this->drow_descs.get_const_data(), this->dstorage.get_const_data(), ddiag_idxs.get_const_data(), this->dmtx_lu.get(), dtmp); diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp index 57086a1550d..40a40b5acf5 100644 --- a/test/factorization/par_ic_kernels.cpp +++ b/test/factorization/par_ic_kernels.cpp @@ -100,7 +100,7 @@ TYPED_TEST(ParIc, KernelInitFactorIsEquivalentToRef) gko::kernels::reference::par_ic_factorization::init_factor( this->ref, this->mtx_l.get()); - gko::kernels::EXEC_NAMESPACE::par_ic_factorization::init_factor( + gko::kernels::GKO_DEVICE_NAMESPACE::par_ic_factorization::init_factor( this->exec, this->dmtx_l.get()); GKO_ASSERT_MTX_NEAR(this->mtx_l, this->dmtx_l, r::value); @@ -118,7 +118,7 @@ TYPED_TEST(ParIc, KernelComputeFactorIsEquivalentToRef) gko::kernels::reference::par_ic_factorization::compute_factor( this->ref, 1, mtx_l_coo.get(), this->mtx_l_ani_init.get()); - gko::kernels::EXEC_NAMESPACE::par_ic_factorization::compute_factor( + gko::kernels::GKO_DEVICE_NAMESPACE::par_ic_factorization::compute_factor( this->exec, 100, dmtx_l_coo.get(), this->dmtx_l_ani_init.get()); GKO_ASSERT_MTX_NEAR(this->mtx_l_ani_init, this->dmtx_l_ani_init, 1e-4); diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp index 254c2e4a40e..81d1dd83ffb 100644 --- a/test/factorization/par_ict_kernels.cpp +++ b/test/factorization/par_ict_kernels.cpp @@ -118,7 +118,7 @@ TYPED_TEST(ParIct, 
KernelAddCandidatesIsEquivalentToRef) gko::kernels::reference::par_ict_factorization::add_candidates( this->ref, mtx_llh.get(), this->mtx.get(), this->mtx_l.get(), res_mtx_l.get()); - gko::kernels::EXEC_NAMESPACE::par_ict_factorization::add_candidates( + gko::kernels::GKO_DEVICE_NAMESPACE::par_ict_factorization::add_candidates( this->exec, dmtx_llh.get(), this->dmtx.get(), this->dmtx_l.get(), dres_mtx_l.get()); @@ -140,9 +140,9 @@ TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef) gko::kernels::reference::par_ict_factorization::compute_factor( this->ref, this->mtx_ani.get(), this->mtx_l_ani.get(), mtx_l_coo.get()); for (int i = 0; i < 20; ++i) { - gko::kernels::EXEC_NAMESPACE::par_ict_factorization::compute_factor( - this->exec, this->dmtx_ani.get(), this->dmtx_l_ani.get(), - dmtx_l_coo.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::par_ict_factorization:: + compute_factor(this->exec, this->dmtx_ani.get(), + this->dmtx_l_ani.get(), dmtx_l_coo.get()); } GKO_ASSERT_MTX_NEAR(this->mtx_l_ani, this->dmtx_l_ani, 1e-2); diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp index 94e2eb6512f..0d853af0745 100644 --- a/test/factorization/par_ilu_kernels.cpp +++ b/test/factorization/par_ilu_kernels.cpp @@ -89,8 +89,8 @@ class ParIlu : public CommonTestFixture { { gko::kernels::reference::factorization::initialize_row_ptrs_l_u( ref, mtx.get(), l_row_ptrs, u_row_ptrs); - gko::kernels::EXEC_NAMESPACE::factorization::initialize_row_ptrs_l_u( - exec, dmtx.get(), dl_row_ptrs, du_row_ptrs); + gko::kernels::GKO_DEVICE_NAMESPACE::factorization:: + initialize_row_ptrs_l_u(exec, dmtx.get(), dl_row_ptrs, du_row_ptrs); } void initialize_lu(std::unique_ptr& l, std::unique_ptr& u, @@ -121,7 +121,7 @@ class ParIlu : public CommonTestFixture { gko::kernels::reference::factorization::initialize_l_u( ref, mtx.get(), l.get(), u.get()); - gko::kernels::EXEC_NAMESPACE::factorization::initialize_l_u( + 
gko::kernels::GKO_DEVICE_NAMESPACE::factorization::initialize_l_u( exec, dmtx.get(), dl.get(), du.get()); } @@ -139,7 +139,7 @@ class ParIlu : public CommonTestFixture { gko::kernels::reference::par_ilu_factorization::compute_l_u_factors( ref, iterations, coo.get(), l.get(), u_transpose_mtx.get()); - gko::kernels::EXEC_NAMESPACE::par_ilu_factorization:: + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilu_factorization:: compute_l_u_factors(exec, iterations, dcoo.get(), dl.get(), u_transpose_dmtx.get()); auto u_lin_op = u_transpose_mtx->transpose(); @@ -160,7 +160,7 @@ TYPED_TEST(ParIlu, KernelAddDiagonalElementsSortedEquivalentToRef) gko::kernels::reference::factorization::add_diagonal_elements( this->ref, mtx.get(), true); - gko::kernels::EXEC_NAMESPACE::factorization::add_diagonal_elements( + gko::kernels::GKO_DEVICE_NAMESPACE::factorization::add_diagonal_elements( this->exec, dmtx.get(), true); ASSERT_TRUE(mtx->is_sorted_by_column_index()); @@ -176,7 +176,7 @@ TYPED_TEST(ParIlu, KernelAddDiagonalElementsUnsortedEquivalentToRef) gko::kernels::reference::factorization::add_diagonal_elements( this->ref, mtx.get(), false); - gko::kernels::EXEC_NAMESPACE::factorization::add_diagonal_elements( + gko::kernels::GKO_DEVICE_NAMESPACE::factorization::add_diagonal_elements( this->exec, dmtx.get(), false); ASSERT_FALSE(mtx->is_sorted_by_column_index()); @@ -193,7 +193,7 @@ TYPED_TEST(ParIlu, KernelAddDiagonalElementsNonSquareEquivalentToRef) gko::kernels::reference::factorization::add_diagonal_elements( this->ref, mtx.get(), true); - gko::kernels::EXEC_NAMESPACE::factorization::add_diagonal_elements( + gko::kernels::GKO_DEVICE_NAMESPACE::factorization::add_diagonal_elements( this->exec, dmtx.get(), true); ASSERT_TRUE(mtx->is_sorted_by_column_index()); diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp index c4ad7fe412a..7d46f7979ac 100644 --- a/test/factorization/par_ilut_kernels.cpp +++ b/test/factorization/par_ilut_kernels.cpp @@ 
-151,8 +151,8 @@ class ParIlut : public CommonTestFixture { gko::kernels::reference::par_ilut_factorization::threshold_select( ref, mtx.get(), rank, tmp, tmp2, res); - gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::threshold_select( - exec, dmtx.get(), rank, dtmp, dtmp2, dres); + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization:: + threshold_select(exec, dmtx.get(), rank, dtmp, dtmp2, dres); ASSERT_NEAR(res, dres, tolerance); } @@ -174,9 +174,9 @@ class ParIlut : public CommonTestFixture { gko::kernels::reference::par_ilut_factorization::threshold_filter( ref, local_mtx.get(), threshold, res.get(), res_coo.get(), lower); - gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::threshold_filter( - exec, local_dmtx.get(), threshold, dres.get(), dres_coo.get(), - lower); + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization:: + threshold_filter(exec, local_dmtx.get(), threshold, dres.get(), + dres_coo.get(), lower); GKO_ASSERT_MTX_NEAR(res, dres, 0); GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); @@ -208,7 +208,7 @@ class ParIlut : public CommonTestFixture { gko::kernels::reference::par_ilut_factorization:: threshold_filter_approx(ref, mtx.get(), rank, tmp, threshold, res.get(), res_coo.get()); - gko::kernels::EXEC_NAMESPACE::par_ilut_factorization:: + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization:: threshold_filter_approx(exec, dmtx.get(), rank, dtmp, dthreshold, dres.get(), dres_coo.get()); @@ -283,8 +283,9 @@ TYPED_TEST(ParIlut, KernelThresholdFilterNullptrCooIsEquivalentToRef) gko::kernels::reference::par_ilut_factorization::threshold_filter( this->ref, this->mtx_l.get(), 0.5, res.get(), null_coo, true); - gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::threshold_filter( - this->exec, this->dmtx_l.get(), 0.5, dres.get(), null_coo, true); + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization:: + threshold_filter(this->exec, this->dmtx_l.get(), 0.5, dres.get(), + null_coo, true); GKO_ASSERT_MTX_NEAR(res, dres, 0); 
GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); @@ -346,7 +347,7 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef) gko::kernels::reference::par_ilut_factorization::threshold_filter_approx( this->ref, this->mtx_l.get(), rank, tmp, threshold, res.get(), null_coo); - gko::kernels::EXEC_NAMESPACE::par_ilut_factorization:: + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization:: threshold_filter_approx(this->exec, this->dmtx_l.get(), rank, dtmp, dthreshold, dres.get(), null_coo); @@ -393,7 +394,7 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef) gko::kernels::reference::par_ilut_factorization::add_candidates( this->ref, mtx_lu.get(), this->mtx_square.get(), this->mtx_l2.get(), this->mtx_u.get(), res_mtx_l.get(), res_mtx_u.get()); - gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::add_candidates( + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::add_candidates( this->exec, dmtx_lu.get(), this->dmtx_square.get(), this->dmtx_l2.get(), this->dmtx_u.get(), dres_mtx_l.get(), dres_mtx_u.get()); @@ -422,7 +423,7 @@ TYPED_TEST(ParIlut, KernelComputeLUIsEquivalentToRef) this->ref, this->mtx_ani.get(), this->mtx_l_ani.get(), mtx_l_coo.get(), this->mtx_u_ani.get(), mtx_u_coo.get(), this->mtx_ut_ani.get()); for (int i = 0; i < 20; ++i) { - gko::kernels::EXEC_NAMESPACE::par_ilut_factorization:: + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization:: compute_l_u_factors(this->exec, this->dmtx_ani.get(), this->dmtx_l_ani.get(), dmtx_l_coo.get(), this->dmtx_u_ani.get(), dmtx_u_coo.get(), diff --git a/test/matrix/csr_kernels.cpp b/test/matrix/csr_kernels.cpp index 347425175bb..d3a7bb8f8e5 100644 --- a/test/matrix/csr_kernels.cpp +++ b/test/matrix/csr_kernels.cpp @@ -149,7 +149,7 @@ void assert_lookup_correct(std::shared_ptr exec, const auto row_ptrs = mtx->get_const_row_ptrs(); const auto col_idxs = mtx->get_const_col_idxs(); gko::array correct{exec, {true}}; - gko::kernels::EXEC_NAMESPACE::run_kernel( + 
gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto row, auto num_cols, auto row_ptrs, auto col_idxs, auto storage_offsets, auto storage, auto row_descs, @@ -215,7 +215,7 @@ TYPED_TEST(CsrLookup, BuildLookupWorks) // otherwise things might crash gko::kernels::reference::csr::build_lookup_offsets( this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); - gko::kernels::EXEC_NAMESPACE::csr::build_lookup_offsets( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::build_lookup_offsets( this->exec, drow_ptrs, dcol_idxs, num_rows, allowed, dstorage_offsets); @@ -238,7 +238,7 @@ TYPED_TEST(CsrLookup, BuildLookupWorks) gko::kernels::reference::csr::build_lookup( this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets, row_descs, storage); - gko::kernels::EXEC_NAMESPACE::csr::build_lookup( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::build_lookup( this->exec, drow_ptrs, dcol_idxs, num_rows, allowed, dstorage_offsets, drow_descs, dstorage); diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp index 713593b4ae5..4ff8e749766 100644 --- a/test/matrix/csr_kernels2.cpp +++ b/test/matrix/csr_kernels2.cpp @@ -1346,7 +1346,7 @@ TEST_F(Csr, CalculateNnzPerRowInSpanIsEquivalentToRef) gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span( this->ref, this->mtx2.get(), rspan, cspan, &row_nnz); - gko::kernels::EXEC_NAMESPACE::csr::calculate_nonzeros_per_row_in_span( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::calculate_nonzeros_per_row_in_span( this->exec, this->dmtx2.get(), rspan, cspan, &drow_nnz); GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); @@ -1382,7 +1382,7 @@ TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef) gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx2.get(), rspan, cspan, smat1.get()); - gko::kernels::EXEC_NAMESPACE::csr::compute_submatrix( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::compute_submatrix( this->exec, this->dmtx2.get(), rspan, cspan, sdmat1.get()); GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0); @@ 
-1408,8 +1408,9 @@ TEST_F(Csr, CalculateNnzPerRowInIndexSetIsEquivalentToRef) gko::kernels::reference::csr::calculate_nonzeros_per_row_in_index_set( this->ref, this->mtx2.get(), rset, cset, row_nnz.get_data()); - gko::kernels::EXEC_NAMESPACE::csr::calculate_nonzeros_per_row_in_index_set( - this->exec, this->dmtx2.get(), drset, dcset, drow_nnz.get_data()); + gko::kernels::GKO_DEVICE_NAMESPACE::csr:: + calculate_nonzeros_per_row_in_index_set( + this->exec, this->dmtx2.get(), drset, dcset, drow_nnz.get_data()); GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); } @@ -1446,7 +1447,7 @@ TEST_F(Csr, ComputeSubmatrixFromIndexSetIsEquivalentToRef) gko::kernels::reference::csr::compute_submatrix_from_index_set( this->ref, this->mtx2.get(), rset, cset, smat1.get()); - gko::kernels::EXEC_NAMESPACE::csr::compute_submatrix_from_index_set( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::compute_submatrix_from_index_set( this->exec, this->dmtx2.get(), drset, dcset, sdmat1.get()); GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0); @@ -1501,7 +1502,7 @@ TEST_F(Csr, CanDetectMissingDiagonalEntry) auto mtx = gko::clone(exec, ref_mtx); bool has_diags = true; - gko::kernels::EXEC_NAMESPACE::csr::check_diagonal_entries_exist( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::check_diagonal_entries_exist( exec, mtx.get(), has_diags); ASSERT_FALSE(has_diags); @@ -1516,7 +1517,7 @@ TEST_F(Csr, CanDetectWhenAllDiagonalEntriesArePresent) auto mtx = gko::clone(exec, ref_mtx); bool has_diags = true; - gko::kernels::EXEC_NAMESPACE::csr::check_diagonal_entries_exist( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::check_diagonal_entries_exist( exec, mtx.get(), has_diags); ASSERT_TRUE(has_diags); diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp index 25b82215dcd..56ca536187e 100644 --- a/test/matrix/dense_kernels.cpp +++ b/test/matrix/dense_kernels.cpp @@ -603,7 +603,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) gko::kernels::reference::dense::count_nonzeros_per_row( ref, x.get(), 
nnz_per_row.get_data()); - gko::kernels::EXEC_NAMESPACE::dense::count_nonzeros_per_row( + gko::kernels::GKO_DEVICE_NAMESPACE::dense::count_nonzeros_per_row( exec, dx.get(), dnnz_per_row.get_data()); auto tmp = gko::array(ref, dnnz_per_row); @@ -621,8 +621,8 @@ TEST_F(Dense, ComputeMaxNNZPerRowIsEquivalentToRef) gko::kernels::reference::dense::compute_max_nnz_per_row(ref, x.get(), max_nnz); - gko::kernels::EXEC_NAMESPACE::dense::compute_max_nnz_per_row(exec, dx.get(), - dmax_nnz); + gko::kernels::GKO_DEVICE_NAMESPACE::dense::compute_max_nnz_per_row( + exec, dx.get(), dmax_nnz); ASSERT_EQ(max_nnz, dmax_nnz); } @@ -2017,7 +2017,7 @@ TEST_F(Dense, ComputeNorm2SquaredIsEquivalentToRef) gko::kernels::reference::dense::compute_squared_norm2( ref, x.get(), norm_expected.get(), tmp); - gko::kernels::EXEC_NAMESPACE::dense::compute_squared_norm2( + gko::kernels::GKO_DEVICE_NAMESPACE::dense::compute_squared_norm2( exec, dx.get(), dnorm.get(), dtmp); GKO_ASSERT_MTX_NEAR(dnorm, norm_expected, r::value); @@ -2033,7 +2033,7 @@ TEST_F(Dense, ComputesSqrt) auto dmtx = gko::clone(exec, mtx); gko::kernels::reference::dense::compute_sqrt(ref, mtx.get()); - gko::kernels::EXEC_NAMESPACE::dense::compute_sqrt(exec, dmtx.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::dense::compute_sqrt(exec, dmtx.get()); GKO_ASSERT_MTX_NEAR(mtx, dmtx, r::value); } diff --git a/test/matrix/ell_kernels.cpp b/test/matrix/ell_kernels.cpp index f6b9a9d1edb..b61d97a0a7a 100644 --- a/test/matrix/ell_kernels.cpp +++ b/test/matrix/ell_kernels.cpp @@ -533,7 +533,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) gko::kernels::reference::ell::count_nonzeros_per_row( ref, mtx.get(), nnz_per_row.get_data()); - gko::kernels::EXEC_NAMESPACE::ell::count_nonzeros_per_row( + gko::kernels::GKO_DEVICE_NAMESPACE::ell::count_nonzeros_per_row( exec, dmtx.get(), dnnz_per_row.get_data()); GKO_ASSERT_ARRAY_EQ(nnz_per_row, dnnz_per_row); diff --git a/test/matrix/sparsity_csr_kernels.cpp b/test/matrix/sparsity_csr_kernels.cpp index 
6fc3caf60ad..010bd7faa86 100644 --- a/test/matrix/sparsity_csr_kernels.cpp +++ b/test/matrix/sparsity_csr_kernels.cpp @@ -64,8 +64,8 @@ TEST_F(SparsityCsr, KernelDiagonalElementPrefixSumIsEquivalentToRef) gko::kernels::reference::sparsity_csr::diagonal_element_prefix_sum( ref, mtx.get(), prefix_sum.get_data()); - gko::kernels::EXEC_NAMESPACE::sparsity_csr::diagonal_element_prefix_sum( - exec, dmtx.get(), dprefix_sum.get_data()); + gko::kernels::GKO_DEVICE_NAMESPACE::sparsity_csr:: + diagonal_element_prefix_sum(exec, dmtx.get(), dprefix_sum.get_data()); GKO_ASSERT_ARRAY_EQ(prefix_sum, dprefix_sum); } @@ -88,7 +88,7 @@ TEST_F(SparsityCsr, KernelRemoveDiagonalElementsIsEquivalentToRef) gko::kernels::reference::sparsity_csr::remove_diagonal_elements( ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), prefix_sum.get_const_data(), out_mtx.get()); - gko::kernels::EXEC_NAMESPACE::sparsity_csr::remove_diagonal_elements( + gko::kernels::GKO_DEVICE_NAMESPACE::sparsity_csr::remove_diagonal_elements( exec, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), dprefix_sum.get_const_data(), dout_mtx.get()); diff --git a/test/multigrid/pgm_kernels.cpp b/test/multigrid/pgm_kernels.cpp index a5f2d32fe32..10e5cf01a7a 100644 --- a/test/multigrid/pgm_kernels.cpp +++ b/test/multigrid/pgm_kernels.cpp @@ -159,8 +159,8 @@ TEST_F(Pgm, MatchEdgeIsEquivalentToRef) auto d_x = d_unfinished_agg; gko::kernels::reference::pgm::match_edge(ref, strongest_neighbor, x); - gko::kernels::EXEC_NAMESPACE::pgm::match_edge(exec, d_strongest_neighbor, - d_x); + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::match_edge( + exec, d_strongest_neighbor, d_x); GKO_ASSERT_ARRAY_EQ(d_x, x); } @@ -173,8 +173,8 @@ TEST_F(Pgm, CountUnaggIsEquivalentToRef) index_type d_num_unagg; gko::kernels::reference::pgm::count_unagg(ref, unfinished_agg, &num_unagg); - gko::kernels::EXEC_NAMESPACE::pgm::count_unagg(exec, d_unfinished_agg, - &d_num_unagg); + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::count_unagg(exec, 
d_unfinished_agg, + &d_num_unagg); ASSERT_EQ(d_num_unagg, num_unagg); } @@ -187,7 +187,7 @@ TEST_F(Pgm, RenumberIsEquivalentToRef) index_type d_num_agg; gko::kernels::reference::pgm::renumber(ref, agg, &num_agg); - gko::kernels::EXEC_NAMESPACE::pgm::renumber(exec, d_agg, &d_num_agg); + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::renumber(exec, d_agg, &d_num_agg); ASSERT_EQ(d_num_agg, num_agg); GKO_ASSERT_ARRAY_EQ(d_agg, agg); @@ -203,7 +203,7 @@ TEST_F(Pgm, FindStrongestNeighborIsEquivalentToRef) gko::kernels::reference::pgm::find_strongest_neighbor( ref, weight_csr.get(), weight_diag.get(), agg, snb); - gko::kernels::EXEC_NAMESPACE::pgm::find_strongest_neighbor( + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::find_strongest_neighbor( exec, d_weight_csr.get(), d_weight_diag.get(), d_agg, d_snb); GKO_ASSERT_ARRAY_EQ(d_snb, snb); @@ -220,7 +220,7 @@ TEST_F(Pgm, AssignToExistAggIsEquivalentToRef) gko::kernels::reference::pgm::assign_to_exist_agg( ref, weight_csr.get(), weight_diag.get(), x, intermediate_agg); - gko::kernels::EXEC_NAMESPACE::pgm::assign_to_exist_agg( + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::assign_to_exist_agg( exec, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg); GKO_ASSERT_ARRAY_EQ(d_x, x); @@ -234,9 +234,10 @@ TEST_F(Pgm, AssignToExistAggUnderteminsticIsEquivalentToRef) auto d_intermediate_agg = gko::array(exec, 0); index_type d_num_unagg; - gko::kernels::EXEC_NAMESPACE::pgm::assign_to_exist_agg( + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::assign_to_exist_agg( exec, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg); - gko::kernels::EXEC_NAMESPACE::pgm::count_unagg(exec, d_agg, &d_num_unagg); + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::count_unagg(exec, d_agg, + &d_num_unagg); // only test whether all elements are aggregated. 
GKO_ASSERT_EQ(d_num_unagg, 0); @@ -257,7 +258,7 @@ TEST_F(Pgm, GatherIndexIsEquivalentToRef) gko::kernels::reference::pgm::gather_index(ref, num, orig.get_const_data(), map.get_const_data(), result.get_data()); - gko::kernels::EXEC_NAMESPACE::pgm::gather_index( + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::gather_index( exec, num, d_orig.get_const_data(), d_map.get_const_data(), d_result.get_data()); diff --git a/test/preconditioner/batch_jacobi_kernels.cpp b/test/preconditioner/batch_jacobi_kernels.cpp index 30dbfa271ee..f8a1bd015ef 100644 --- a/test/preconditioner/batch_jacobi_kernels.cpp +++ b/test/preconditioner/batch_jacobi_kernels.cpp @@ -117,7 +117,7 @@ class BatchJacobi : public CommonTestFixture { const gko::batch::BatchLinOp* prec, const Mtx* mtx, const MVec* b, MVec* x, LogData& log_data) { - gko::kernels::EXEC_NAMESPACE::batch_bicgstab::apply< + gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply< typename Mtx::value_type>(executor, settings, mtx, prec, b, x, log_data); }; diff --git a/test/preconditioner/isai_kernels.cpp b/test/preconditioner/isai_kernels.cpp index 57f8c14ac27..6e737d31790 100644 --- a/test/preconditioner/isai_kernels.cpp +++ b/test/preconditioner/isai_kernels.cpp @@ -122,7 +122,7 @@ TEST_F(Isai, IsaiGenerateLinverseShortIsEquivalentToRef) gko::kernels::reference::isai::generate_tri_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); - gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), true); @@ -145,7 +145,7 @@ TEST_F(Isai, IsaiGenerateUinverseShortIsEquivalentToRef) gko::kernels::reference::isai::generate_tri_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); - gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), 
da2.get_data(), false); @@ -168,7 +168,7 @@ TEST_F(Isai, IsaiGenerateAinverseShortIsEquivalentToRef) gko::kernels::reference::isai::generate_general_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); - gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), false); @@ -191,7 +191,7 @@ TEST_F(Isai, IsaiGenerateSpdinverseShortIsEquivalentToRef) gko::kernels::reference::isai::generate_general_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); - gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), true); @@ -214,7 +214,7 @@ TEST_F(Isai, IsaiGenerateLinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_tri_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); - gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), true); @@ -237,7 +237,7 @@ TEST_F(Isai, IsaiGenerateUinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_tri_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); - gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), false); @@ -260,7 +260,7 @@ TEST_F(Isai, IsaiGenerateAinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_general_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); - gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse( exec, 
d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), false); @@ -283,7 +283,7 @@ TEST_F(Isai, IsaiGenerateSpdinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_general_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); - gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), false); @@ -315,7 +315,7 @@ TEST_F(Isai, IsaiGenerateExcessLinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_excess_system( ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), excess.get(), e_rhs.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system( exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(), da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); @@ -346,7 +346,7 @@ TEST_F(Isai, IsaiGenerateExcessUinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_excess_system( ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), excess.get(), e_rhs.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system( exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(), da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); @@ -377,7 +377,7 @@ TEST_F(Isai, IsaiGenerateExcessAinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_excess_system( ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), excess.get(), e_rhs.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system( exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(), da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); @@ 
-408,7 +408,7 @@ TEST_F(Isai, IsaiGenerateExcessSpdinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_excess_system( ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), excess.get(), e_rhs.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system( exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(), da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); @@ -439,7 +439,7 @@ TEST_F(Isai, IsaiGeneratePartialExcessIsEquivalentToRef) gko::kernels::reference::isai::generate_excess_system( ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), excess.get(), e_rhs.get(), 5u, 10u); - gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system( exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(), da2.get_const_data(), dexcess.get(), de_rhs.get(), 5u, 10u); @@ -467,7 +467,7 @@ TEST_F(Isai, IsaiScaleExcessSolutionIsEquivalentToRef) gko::kernels::reference::isai::scale_excess_solution( ref, a1.get_const_data(), e_rhs.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::scale_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scale_excess_solution( exec, da1.get_const_data(), de_rhs.get(), 0, num_rows); GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); @@ -490,7 +490,7 @@ TEST_F(Isai, IsaiScalePartialExcessSolutionIsEquivalentToRef) gko::kernels::reference::isai::scale_excess_solution( ref, a1.get_const_data(), e_rhs.get(), 5u, 10u); - gko::kernels::EXEC_NAMESPACE::isai::scale_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scale_excess_solution( exec, da1.get_const_data(), de_rhs.get(), 5u, 10u); GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); @@ -514,7 +514,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionLIsEquivalentToRef) gko::kernels::reference::isai::scatter_excess_solution( ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, 
num_rows); - gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution( exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows); GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); @@ -540,7 +540,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionUIsEquivalentToRef) gko::kernels::reference::isai::scatter_excess_solution( ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution( exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows); GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); @@ -566,7 +566,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionAIsEquivalentToRef) gko::kernels::reference::isai::scatter_excess_solution( ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution( exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows); GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); @@ -592,7 +592,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionSpdIsEquivalentToRef) gko::kernels::reference::isai::scatter_excess_solution( ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution( exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows); GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); @@ -618,7 +618,7 @@ TEST_F(Isai, IsaiScatterPartialExcessSolutionIsEquivalentToRef) gko::kernels::reference::isai::scatter_excess_solution( ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 5u, 10u); - gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution( exec, da1.get_const_data(), de_rhs.get(), 
d_inverse.get(), 5u, 10u); GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); diff --git a/test/solver/batch_bicgstab_kernels.cpp b/test/solver/batch_bicgstab_kernels.cpp index 821a8a6d29c..14bca65e41f 100644 --- a/test/solver/batch_bicgstab_kernels.cpp +++ b/test/solver/batch_bicgstab_kernels.cpp @@ -52,7 +52,7 @@ class BatchBicgstab : public CommonTestFixture { const gko::batch::BatchLinOp* prec, const Mtx* mtx, const MVec* b, MVec* x, LogData& log_data) { - gko::kernels::EXEC_NAMESPACE::batch_bicgstab::apply< + gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply< typename Mtx::value_type>(executor, settings, mtx, prec, b, x, log_data); }; diff --git a/test/solver/batch_cg_kernels.cpp b/test/solver/batch_cg_kernels.cpp index 49f0db2a09b..7c013020686 100644 --- a/test/solver/batch_cg_kernels.cpp +++ b/test/solver/batch_cg_kernels.cpp @@ -50,7 +50,7 @@ class BatchCg : public CommonTestFixture { const gko::batch::BatchLinOp* prec, const Mtx* mtx, const MVec* b, MVec* x, LogData& log_data) { - gko::kernels::EXEC_NAMESPACE::batch_cg::apply< + gko::kernels::GKO_DEVICE_NAMESPACE::batch_cg::apply< typename Mtx::value_type>(executor, settings, mtx, prec, b, x, log_data); }; diff --git a/test/solver/bicg_kernels.cpp b/test/solver/bicg_kernels.cpp index 616f7eff096..ab63b01f9cc 100644 --- a/test/solver/bicg_kernels.cpp +++ b/test/solver/bicg_kernels.cpp @@ -139,7 +139,7 @@ TEST_F(Bicg, BicgInitializeIsEquivalentToRef) gko::kernels::reference::bicg::initialize( ref, b.get(), r.get(), z.get(), p.get(), q.get(), prev_rho.get(), rho.get(), r2.get(), z2.get(), p2.get(), q2.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicg::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::bicg::initialize( exec, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(), d_prev_rho.get(), d_rho.get(), d_r2.get(), d_z2.get(), d_p2.get(), d_q2.get(), d_stop_status.get()); @@ -165,7 +165,7 @@ TEST_F(Bicg, BicgStep1IsEquivalentToRef) gko::kernels::reference::bicg::step_1(ref, p.get(), 
z.get(), p2.get(), z2.get(), rho.get(), prev_rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicg::step_1( + gko::kernels::GKO_DEVICE_NAMESPACE::bicg::step_1( exec, d_p.get(), d_z.get(), d_p2.get(), d_z2.get(), d_rho.get(), d_prev_rho.get(), d_stop_status.get()); @@ -183,7 +183,7 @@ TEST_F(Bicg, BicgStep2IsEquivalentToRef) gko::kernels::reference::bicg::step_2( ref, x.get(), r.get(), r2.get(), p.get(), q.get(), q2.get(), beta.get(), rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicg::step_2( + gko::kernels::GKO_DEVICE_NAMESPACE::bicg::step_2( exec, d_x.get(), d_r.get(), d_r2.get(), d_p.get(), d_q.get(), d_q2.get(), d_beta.get(), d_rho.get(), d_stop_status.get()); diff --git a/test/solver/bicgstab_kernels.cpp b/test/solver/bicgstab_kernels.cpp index a63ff7f39f4..4f68edd6a8e 100644 --- a/test/solver/bicgstab_kernels.cpp +++ b/test/solver/bicgstab_kernels.cpp @@ -176,7 +176,7 @@ TEST_F(Bicgstab, BicgstabInitializeIsEquivalentToRef) ref, b.get(), r.get(), rr.get(), y.get(), s.get(), t.get(), z.get(), v.get(), p.get(), prev_rho.get(), rho.get(), alpha.get(), beta.get(), gamma.get(), omega.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicgstab::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::initialize( exec, d_b.get(), d_r.get(), d_rr.get(), d_y.get(), d_s.get(), d_t.get(), d_z.get(), d_v.get(), d_p.get(), d_prev_rho.get(), d_rho.get(), d_alpha.get(), d_beta.get(), d_gamma.get(), d_omega.get(), @@ -207,7 +207,7 @@ TEST_F(Bicgstab, BicgstabStep1IsEquivalentToRef) gko::kernels::reference::bicgstab::step_1( ref, r.get(), p.get(), v.get(), rho.get(), prev_rho.get(), alpha.get(), omega.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicgstab::step_1( + gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::step_1( exec, d_r.get(), d_p.get(), d_v.get(), d_rho.get(), d_prev_rho.get(), d_alpha.get(), d_omega.get(), d_stop_status.get()); @@ -222,7 +222,7 @@ TEST_F(Bicgstab, BicgstabStep2IsEquivalentToRef) 
gko::kernels::reference::bicgstab::step_2(ref, r.get(), s.get(), v.get(), rho.get(), alpha.get(), beta.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicgstab::step_2( + gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::step_2( exec, d_r.get(), d_s.get(), d_v.get(), d_rho.get(), d_alpha.get(), d_beta.get(), d_stop_status.get()); @@ -238,7 +238,7 @@ TEST_F(Bicgstab, BicgstabStep3IsEquivalentToRef) gko::kernels::reference::bicgstab::step_3( ref, x.get(), r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(), beta.get(), gamma.get(), omega.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicgstab::step_3( + gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::step_3( exec, d_x.get(), d_r.get(), d_s.get(), d_t.get(), d_y.get(), d_z.get(), d_alpha.get(), d_beta.get(), d_gamma.get(), d_omega.get(), d_stop_status.get()); diff --git a/test/solver/cb_gmres_kernels.cpp b/test/solver/cb_gmres_kernels.cpp index 4f854a26180..3b5f5956c2e 100644 --- a/test/solver/cb_gmres_kernels.cpp +++ b/test/solver/cb_gmres_kernels.cpp @@ -209,7 +209,7 @@ TEST_F(CbGmres, CbGmresInitialize1IsEquivalentToRef) gko::kernels::reference::cb_gmres::initialize( ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(), stop_status.get(), default_krylov_dim_mixed); - gko::kernels::EXEC_NAMESPACE::cb_gmres::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::initialize( exec, d_b.get(), d_residual.get(), d_givens_sin.get(), d_givens_cos.get(), d_stop_status.get(), default_krylov_dim_mixed); @@ -230,7 +230,7 @@ TEST_F(CbGmres, CbGmresInitialize2IsEquivalentToRef) residual_norm_collection.get(), arnoldi_norm.get(), range_helper.get_range(), next_krylov_basis.get(), final_iter_nums.get(), tmp, default_krylov_dim_mixed); - gko::kernels::EXEC_NAMESPACE::cb_gmres::restart( + gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::restart( exec, d_residual.get(), d_residual_norm.get(), d_residual_norm_collection.get(), d_arnoldi_norm.get(), d_range_helper.get_range(), 
d_next_krylov_basis.get(), @@ -255,7 +255,7 @@ TEST_F(CbGmres, CbGmresStep1IsEquivalentToRef) range_helper.get_range(), hessenberg_iter.get(), buffer_iter.get(), arnoldi_norm.get(), iter, final_iter_nums.get(), stop_status.get(), reorth_status.get(), num_reorth.get()); - gko::kernels::EXEC_NAMESPACE::cb_gmres::arnoldi( + gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::arnoldi( exec, d_next_krylov_basis.get(), d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(), d_residual_norm_collection.get(), d_range_helper.get_range(), d_hessenberg_iter.get(), @@ -285,7 +285,7 @@ TEST_F(CbGmres, CbGmresStep2IsEquivalentToRef) ref, residual_norm_collection.get(), range_helper.get_range().get_accessor().to_const(), hessenberg.get(), y.get(), before_preconditioner.get(), final_iter_nums.get()); - gko::kernels::EXEC_NAMESPACE::cb_gmres::solve_krylov( + gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::solve_krylov( exec, d_residual_norm_collection.get(), d_range_helper.get_range().get_accessor().to_const(), d_hessenberg.get(), d_y.get(), d_before_preconditioner.get(), diff --git a/test/solver/cg_kernels.cpp b/test/solver/cg_kernels.cpp index 41af16489a2..be9dc052314 100644 --- a/test/solver/cg_kernels.cpp +++ b/test/solver/cg_kernels.cpp @@ -114,7 +114,7 @@ TEST_F(Cg, CgInitializeIsEquivalentToRef) gko::kernels::reference::cg::initialize(ref, b.get(), r.get(), z.get(), p.get(), q.get(), prev_rho.get(), rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cg::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::cg::initialize( exec, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(), d_prev_rho.get(), d_rho.get(), d_stop_status.get()); @@ -134,9 +134,9 @@ TEST_F(Cg, CgStep1IsEquivalentToRef) gko::kernels::reference::cg::step_1(ref, p.get(), z.get(), rho.get(), prev_rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cg::step_1(exec, d_p.get(), d_z.get(), - d_rho.get(), d_prev_rho.get(), - d_stop_status.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::cg::step_1( 
+ exec, d_p.get(), d_z.get(), d_rho.get(), d_prev_rho.get(), + d_stop_status.get()); GKO_ASSERT_MTX_NEAR(d_p, p, ::r::value); GKO_ASSERT_MTX_NEAR(d_z, z, ::r::value); @@ -149,9 +149,9 @@ TEST_F(Cg, CgStep2IsEquivalentToRef) gko::kernels::reference::cg::step_2(ref, x.get(), r.get(), p.get(), q.get(), beta.get(), rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cg::step_2(exec, d_x.get(), d_r.get(), - d_p.get(), d_q.get(), d_beta.get(), - d_rho.get(), d_stop_status.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::cg::step_2( + exec, d_x.get(), d_r.get(), d_p.get(), d_q.get(), d_beta.get(), + d_rho.get(), d_stop_status.get()); GKO_ASSERT_MTX_NEAR(d_x, x, ::r::value); GKO_ASSERT_MTX_NEAR(d_r, r, ::r::value); diff --git a/test/solver/cgs_kernels.cpp b/test/solver/cgs_kernels.cpp index 123f76727b5..6c2bab293e3 100644 --- a/test/solver/cgs_kernels.cpp +++ b/test/solver/cgs_kernels.cpp @@ -167,7 +167,7 @@ TEST_F(Cgs, CgsInitializeIsEquivalentToRef) ref, b.get(), r.get(), r_tld.get(), p.get(), q.get(), u.get(), u_hat.get(), v_hat.get(), t.get(), alpha.get(), beta.get(), gamma.get(), rho_prev.get(), rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cgs::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::cgs::initialize( exec, d_b.get(), d_r.get(), d_r_tld.get(), d_p.get(), d_q.get(), d_u.get(), d_u_hat.get(), d_v_hat.get(), d_t.get(), d_alpha.get(), d_beta.get(), d_gamma.get(), d_rho_prev.get(), d_rho.get(), @@ -197,7 +197,7 @@ TEST_F(Cgs, CgsStep1IsEquivalentToRef) gko::kernels::reference::cgs::step_1(ref, r.get(), u.get(), p.get(), q.get(), beta.get(), rho.get(), rho_prev.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cgs::step_1( + gko::kernels::GKO_DEVICE_NAMESPACE::cgs::step_1( exec, d_r.get(), d_u.get(), d_p.get(), d_q.get(), d_beta.get(), d_rho.get(), d_rho_prev.get(), d_stop_status.get()); @@ -214,7 +214,7 @@ TEST_F(Cgs, CgsStep2IsEquivalentToRef) gko::kernels::reference::cgs::step_2(ref, u.get(), v_hat.get(), q.get(), t.get(), 
alpha.get(), rho.get(), gamma.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cgs::step_2( + gko::kernels::GKO_DEVICE_NAMESPACE::cgs::step_2( exec, d_u.get(), d_v_hat.get(), d_q.get(), d_t.get(), d_alpha.get(), d_rho.get(), d_gamma.get(), d_stop_status.get()); @@ -231,7 +231,7 @@ TEST_F(Cgs, CgsStep3IsEquivalentToRef) gko::kernels::reference::cgs::step_3(ref, t.get(), u_hat.get(), r.get(), x.get(), alpha.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cgs::step_3( + gko::kernels::GKO_DEVICE_NAMESPACE::cgs::step_3( exec, d_t.get(), d_u_hat.get(), d_r.get(), d_x.get(), d_alpha.get(), d_stop_status.get()); diff --git a/test/solver/fcg_kernels.cpp b/test/solver/fcg_kernels.cpp index faf7225c883..f1f09f759bc 100644 --- a/test/solver/fcg_kernels.cpp +++ b/test/solver/fcg_kernels.cpp @@ -122,7 +122,7 @@ TEST_F(Fcg, FcgInitializeIsEquivalentToRef) gko::kernels::reference::fcg::initialize( ref, b.get(), r.get(), z.get(), p.get(), q.get(), t.get(), prev_rho.get(), rho.get(), rho_t.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::fcg::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::fcg::initialize( exec, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(), d_t.get(), d_prev_rho.get(), d_rho.get(), d_rho_t.get(), d_stop_status.get()); @@ -144,9 +144,9 @@ TEST_F(Fcg, FcgStep1IsEquivalentToRef) gko::kernels::reference::fcg::step_1(ref, p.get(), z.get(), rho_t.get(), prev_rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::fcg::step_1(exec, d_p.get(), d_z.get(), - d_rho_t.get(), d_prev_rho.get(), - d_stop_status.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::fcg::step_1( + exec, d_p.get(), d_z.get(), d_rho_t.get(), d_prev_rho.get(), + d_stop_status.get()); GKO_ASSERT_MTX_NEAR(d_p, p, ::r::value); GKO_ASSERT_MTX_NEAR(d_z, z, ::r::value); @@ -159,7 +159,7 @@ TEST_F(Fcg, FcgStep2IsEquivalentToRef) gko::kernels::reference::fcg::step_2(ref, x.get(), r.get(), t.get(), p.get(), q.get(), beta.get(), rho.get(), stop_status.get()); - 
gko::kernels::EXEC_NAMESPACE::fcg::step_2( + gko::kernels::GKO_DEVICE_NAMESPACE::fcg::step_2( exec, d_x.get(), d_r.get(), d_t.get(), d_p.get(), d_q.get(), d_beta.get(), d_rho.get(), d_stop_status.get()); diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp index 575d55ded87..7a00b3fed30 100644 --- a/test/solver/gcr_kernels.cpp +++ b/test/solver/gcr_kernels.cpp @@ -153,7 +153,7 @@ TEST_F(Gcr, GcrKernelInitializeIsEquivalentToRef) gko::kernels::reference::gcr::initialize(ref, b.get(), residual.get(), stop_status.get_data()); - gko::kernels::EXEC_NAMESPACE::gcr::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::gcr::initialize( exec, d_b.get(), d_residual.get(), d_stop_status.get_data()); GKO_ASSERT_MTX_NEAR(d_residual, residual, r::value); @@ -168,7 +168,7 @@ TEST_F(Gcr, GcrKernelRestartIsEquivalentToRef) gko::kernels::reference::gcr::restart(ref, residual.get(), A_residual.get(), p_bases.get(), Ap_bases.get(), final_iter_nums.get_data()); - gko::kernels::EXEC_NAMESPACE::gcr::restart( + gko::kernels::GKO_DEVICE_NAMESPACE::gcr::restart( exec, d_residual.get(), d_A_residual.get(), d_p_bases.get(), d_Ap_bases.get(), d_final_iter_nums.get_data()); @@ -186,7 +186,7 @@ TEST_F(Gcr, GcrStep1IsEquivalentToRef) gko::kernels::reference::gcr::step_1(ref, x.get(), residual.get(), p.get(), Ap.get(), Ap_norm.get(), rAp.get(), stop_status.get_data()); - gko::kernels::EXEC_NAMESPACE::gcr::step_1( + gko::kernels::GKO_DEVICE_NAMESPACE::gcr::step_1( exec, d_x.get(), d_residual.get(), d_p.get(), d_Ap.get(), d_Ap_norm.get(), d_rAp.get(), d_stop_status.get_data()); diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp index ac9139d81aa..08259c91ce0 100644 --- a/test/solver/gmres_kernels.cpp +++ b/test/solver/gmres_kernels.cpp @@ -159,7 +159,7 @@ TEST_F(Gmres, GmresKernelInitializeIsEquivalentToRef) gko::kernels::reference::common_gmres::initialize( ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(), stop_status.get_data()); - 
gko::kernels::EXEC_NAMESPACE::common_gmres::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::initialize( exec, d_b.get(), d_residual.get(), d_givens_sin.get(), d_givens_cos.get(), d_stop_status.get_data()); @@ -180,7 +180,7 @@ TEST_F(Gmres, GmresKernelRestartIsEquivalentToRef) ref, residual.get(), residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(), final_iter_nums.get_data()); - gko::kernels::EXEC_NAMESPACE::gmres::restart( + gko::kernels::GKO_DEVICE_NAMESPACE::gmres::restart( exec, d_residual.get(), d_residual_norm.get(), d_residual_norm_collection.get(), d_krylov_bases.get(), d_final_iter_nums.get_data()); @@ -202,7 +202,7 @@ TEST_F(Gmres, GmresKernelHessenbergQRIsEquivalentToRef) ref, givens_sin.get(), givens_cos.get(), residual_norm.get(), residual_norm_collection.get(), hessenberg_iter.get(), iter, final_iter_nums.get_data(), stop_status.get_const_data()); - gko::kernels::EXEC_NAMESPACE::common_gmres::hessenberg_qr( + gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::hessenberg_qr( exec, d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(), d_residual_norm_collection.get(), d_hessenberg_iter.get(), iter, d_final_iter_nums.get_data(), d_stop_status.get_const_data()); @@ -228,7 +228,7 @@ TEST_F(Gmres, GmresKernelHessenbergQROnSingleRHSIsEquivalentToRef) ref, givens_sin.get(), givens_cos.get(), residual_norm.get(), residual_norm_collection.get(), hessenberg_iter.get(), iter, final_iter_nums.get_data(), stop_status.get_const_data()); - gko::kernels::EXEC_NAMESPACE::common_gmres::hessenberg_qr( + gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::hessenberg_qr( exec, d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(), d_residual_norm_collection.get(), d_hessenberg_iter.get(), iter, d_final_iter_nums.get_data(), d_stop_status.get_const_data()); @@ -252,7 +252,7 @@ TEST_F(Gmres, GmresKernelSolveKrylovIsEquivalentToRef) gko::kernels::reference::common_gmres::solve_krylov( ref, residual_norm_collection.get(), 
hessenberg.get(), y.get(), final_iter_nums.get_const_data(), stop_status.get_const_data()); - gko::kernels::EXEC_NAMESPACE::common_gmres::solve_krylov( + gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::solve_krylov( exec, d_residual_norm_collection.get(), d_hessenberg.get(), d_y.get(), d_final_iter_nums.get_const_data(), d_stop_status.get_const_data()); @@ -267,7 +267,7 @@ TEST_F(Gmres, GmresKernelMultiAxpyIsEquivalentToRef) gko::kernels::reference::gmres::multi_axpy( ref, krylov_bases.get(), y.get(), before_preconditioner.get(), final_iter_nums.get_const_data(), stop_status.get_data()); - gko::kernels::EXEC_NAMESPACE::gmres::multi_axpy( + gko::kernels::GKO_DEVICE_NAMESPACE::gmres::multi_axpy( exec, d_krylov_bases.get(), d_y.get(), d_before_preconditioner.get(), d_final_iter_nums.get_const_data(), d_stop_status.get_data()); diff --git a/test/solver/idr_kernels.cpp b/test/solver/idr_kernels.cpp index 31c7df99168..b165824dbe0 100644 --- a/test/solver/idr_kernels.cpp +++ b/test/solver/idr_kernels.cpp @@ -160,7 +160,7 @@ TEST_F(Idr, IdrInitializeIsEquivalentToRef) gko::kernels::reference::idr::initialize(ref, nrhs, m.get(), p.get(), true, stop_status.get()); - gko::kernels::EXEC_NAMESPACE::idr::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::idr::initialize( exec, nrhs, d_m.get(), d_p.get(), true, d_stop_status.get()); GKO_ASSERT_MTX_NEAR(m, d_m, rr::value); @@ -176,7 +176,7 @@ TEST_F(Idr, IdrStep1IsEquivalentToRef) gko::kernels::reference::idr::step_1(ref, nrhs, k, m.get(), f.get(), r.get(), g.get(), c.get(), v.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::idr::step_1( + gko::kernels::GKO_DEVICE_NAMESPACE::idr::step_1( exec, nrhs, k, d_m.get(), d_f.get(), d_r.get(), d_g.get(), d_c.get(), d_v.get(), d_stop_status.get()); @@ -192,9 +192,9 @@ TEST_F(Idr, IdrStep2IsEquivalentToRef) gko::size_type k = 2; gko::kernels::reference::idr::step_2(ref, nrhs, k, omega.get(), v.get(), c.get(), u.get(), stop_status.get()); - 
gko::kernels::EXEC_NAMESPACE::idr::step_2(exec, nrhs, k, d_omega.get(), - d_v.get(), d_c.get(), d_u.get(), - d_stop_status.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::idr::step_2( + exec, nrhs, k, d_omega.get(), d_v.get(), d_c.get(), d_u.get(), + d_stop_status.get()); GKO_ASSERT_MTX_NEAR(u, d_u, rr::value); } @@ -208,7 +208,7 @@ TEST_F(Idr, IdrStep3IsEquivalentToRef) gko::kernels::reference::idr::step_3( ref, nrhs, k, p.get(), g.get(), v.get(), u.get(), m.get(), f.get(), alpha.get(), r.get(), x.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::idr::step_3( + gko::kernels::GKO_DEVICE_NAMESPACE::idr::step_3( exec, nrhs, k, d_p.get(), d_g.get(), d_v.get(), d_u.get(), d_m.get(), d_f.get(), d_alpha.get(), d_r.get(), d_x.get(), d_stop_status.get()); @@ -230,7 +230,7 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef) gko::kernels::reference::idr::compute_omega(ref, nrhs, kappa, tht.get(), residual_norm.get(), omega.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::idr::compute_omega( + gko::kernels::GKO_DEVICE_NAMESPACE::idr::compute_omega( exec, nrhs, kappa, d_tht.get(), d_residual_norm.get(), d_omega.get(), d_stop_status.get()); diff --git a/test/solver/ir_kernels.cpp b/test/solver/ir_kernels.cpp index 99550dfd99f..7a8e84324bd 100644 --- a/test/solver/ir_kernels.cpp +++ b/test/solver/ir_kernels.cpp @@ -55,7 +55,7 @@ TEST_F(Ir, InitializeIsEquivalentToRef) auto d_stop_status = gko::array(exec, stop_status); gko::kernels::reference::ir::initialize(ref, &stop_status); - gko::kernels::EXEC_NAMESPACE::ir::initialize(exec, &d_stop_status); + gko::kernels::GKO_DEVICE_NAMESPACE::ir::initialize(exec, &d_stop_status); auto tmp = gko::array(ref, d_stop_status); for (int i = 0; i < stop_status.get_size(); ++i) { diff --git a/test/solver/multigrid_kernels.cpp b/test/solver/multigrid_kernels.cpp index 139cb1a4647..4b4b0157df5 100644 --- a/test/solver/multigrid_kernels.cpp +++ b/test/solver/multigrid_kernels.cpp @@ -144,7 +144,7 @@ TEST_F(Multigrid, 
MultigridKCycleStep1IsEquivalentToRef) gko::kernels::reference::multigrid::kcycle_step_1( ref, alpha.get(), rho.get(), v.get(), g.get(), d.get(), e.get()); - gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_step_1( + gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_step_1( exec, d_alpha.get(), d_rho.get(), d_v.get(), d_g.get(), d_d.get(), d_e.get()); @@ -161,7 +161,7 @@ TEST_F(Multigrid, MultigridKCycleStep2IsEquivalentToRef) gko::kernels::reference::multigrid::kcycle_step_2( ref, alpha.get(), rho.get(), gamma.get(), beta.get(), zeta.get(), d.get(), e.get()); - gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_step_2( + gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_step_2( exec, d_alpha.get(), d_rho.get(), d_gamma.get(), d_beta.get(), d_zeta.get(), d_d.get(), d_e.get()); @@ -179,11 +179,11 @@ TEST_F(Multigrid, MultigridKCycleCheckStopIsEquivalentToRef) gko::kernels::reference::multigrid::kcycle_check_stop( ref, old_norm.get(), new_norm.get(), 1.0, is_stop_10); - gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_check_stop( + gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_check_stop( exec, d_old_norm.get(), d_new_norm.get(), 1.0, d_is_stop_10); gko::kernels::reference::multigrid::kcycle_check_stop( ref, old_norm.get(), new_norm.get(), 0.5, is_stop_5); - gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_check_stop( + gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_check_stop( exec, d_old_norm.get(), d_new_norm.get(), 0.5, d_is_stop_5); GKO_ASSERT_EQ(d_is_stop_10, is_stop_10); From ae72a92ebfee0556313645ffca64bea74f5bf4a1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 21 Feb 2024 17:37:42 +0100 Subject: [PATCH 007/448] prepare for unification - Add necessary switching headers - Provide device namespace macro via compiler definitions - Add necessary (namespace) aliases - adapt math lib includes and namespaces - uniformize files --- accessor/cuda_hip_helper.hpp | 38 ++ cmake/create_test.cmake | 12 +- common/cuda_hip/base/blas_bindings.hpp | 
16 + common/cuda_hip/base/config.hpp | 16 + common/cuda_hip/base/pointer_mode_guard.hpp | 16 + common/cuda_hip/base/randlib_bindings.hpp | 16 + common/cuda_hip/base/runtime.hpp | 14 + common/cuda_hip/base/sparselib_bindings.hpp | 16 + common/cuda_hip/base/thrust.hpp | 51 +++ common/cuda_hip/base/types.hpp | 9 + common/cuda_hip/components/atomic.hpp.inc | 32 ++ .../components/cooperative_groups.hpp | 16 + .../cuda_hip/components/format_conversion.hpp | 16 + common/cuda_hip/components/memory.hpp | 16 + ...ernels.hpp.inc => par_ict_kernels.hpp.inc} | 68 ++++ .../par_ict_sweep_kernels.hpp.inc | 76 ---- ... => jacobi_advanced_apply_kernels.hpp.inc} | 0 ...pp.inc => jacobi_generate_kernels.hpp.inc} | 0 ...nc => jacobi_simple_apply_kernels.hpp.inc} | 0 common/unified/base/kernel_launch.hpp | 4 +- core/test/gtest/CMakeLists.txt | 8 +- cuda/CMakeLists.txt | 4 +- cuda/base/batch_multi_vector_kernels.cu | 10 +- cuda/base/batch_struct.hpp | 4 +- cuda/base/cublas_bindings.hpp | 16 +- cuda/base/curand_bindings.hpp | 14 +- cuda/base/cusparse_bindings.hpp | 20 +- cuda/base/cusparse_block_bindings.hpp | 2 +- cuda/base/device_matrix_data_kernels.cu | 2 +- cuda/base/executor.cpp | 2 +- cuda/base/kernel_launch.cuh | 13 +- cuda/base/kernel_launch_reduction.cuh | 4 +- cuda/base/kernel_launch_solver.cuh | 3 + cuda/base/types.hpp | 4 + cuda/components/atomic.cuh | 34 +- cuda/components/cooperative_groups.cuh | 2 +- .../diagonal_block_manipulation.cuh | 6 +- cuda/components/format_conversion.cuh | 2 +- cuda/components/memory.cuh | 2 +- cuda/components/prefix_sum.cuh | 4 +- cuda/components/reduction.cuh | 7 +- cuda/components/searching.cuh | 2 +- cuda/components/segment_scan.cuh | 2 +- cuda/components/sorting.cuh | 4 +- cuda/components/syncfree.cuh | 6 +- cuda/components/thread_ids.cuh | 7 +- cuda/distributed/vector_kernels.cu | 3 + cuda/factorization/cholesky_kernels.cu | 18 +- cuda/factorization/factorization_kernels.cu | 7 +- cuda/factorization/ic_kernels.cu | 32 +- 
cuda/factorization/ilu_kernels.cu | 32 +- cuda/factorization/lu_kernels.cu | 4 +- cuda/factorization/par_ic_kernels.cu | 4 +- cuda/factorization/par_ict_kernels.cu | 6 +- cuda/factorization/par_ilu_kernels.cu | 6 +- ...l.cu => par_ilut_approx_filter_kernels.cu} | 6 +- ...r_kernel.cu => par_ilut_filter_kernels.cu} | 7 +- ...t_kernel.cu => par_ilut_select_kernels.cu} | 3 +- ...m_kernel.cu => par_ilut_spgeam_kernels.cu} | 7 +- ...ep_kernel.cu => par_ilut_sweep_kernels.cu} | 3 +- cuda/log/batch_logger.cuh | 1 + cuda/matrix/batch_csr_kernels.cu | 5 +- cuda/matrix/batch_dense_kernels.cu | 6 +- cuda/matrix/batch_ell_kernels.cu | 5 +- cuda/matrix/batch_struct.hpp | 2 +- cuda/matrix/coo_kernels.cu | 16 +- cuda/matrix/csr_kernels.template.cu | 374 +++++++++--------- cuda/matrix/dense_kernels.cu | 99 ++--- cuda/matrix/diagonal_kernels.cu | 7 +- cuda/matrix/ell_kernels.cu | 38 +- cuda/matrix/fbcsr_kernels.template.cu | 92 +++-- cuda/matrix/sellp_kernels.cu | 7 +- cuda/matrix/sparsity_csr_kernels.cu | 37 +- cuda/multigrid/pgm_kernels.cu | 4 +- cuda/preconditioner/batch_preconditioners.cuh | 2 +- cuda/preconditioner/isai_kernels.cu | 7 +- ...el.cu => jacobi_advanced_apply_kernels.cu} | 0 ...obi_advanced_apply_kernels.instantiate.cu} | 8 +- cuda/preconditioner/jacobi_common.hpp.in | 2 +- ...e_kernel.cu => jacobi_generate_kernels.cu} | 0 ...=> jacobi_generate_kernels.instantiate.cu} | 8 +- cuda/preconditioner/jacobi_kernels.cu | 13 +- ...rnel.cu => jacobi_simple_apply_kernels.cu} | 0 ...acobi_simple_apply_kernels.instantiate.cu} | 8 +- cuda/reorder/rcm_kernels.cu | 2 +- cuda/solver/batch_bicgstab_kernels.cu | 7 +- cuda/solver/batch_cg_kernels.cu | 6 +- cuda/solver/cb_gmres_kernels.cu | 42 +- cuda/solver/common_trs_kernels.cuh | 88 ++--- cuda/solver/idr_kernels.cu | 30 +- cuda/solver/lower_trs_kernels.cu | 4 +- cuda/solver/multigrid_kernels.cu | 3 +- cuda/solver/upper_trs_kernels.cu | 4 +- cuda/stop/criterion_kernels.cu | 2 +- cuda/stop/residual_norm_kernels.cu | 2 +- 
cuda/test/base/math.cu | 2 +- cuda/test/components/cooperative_groups.cu | 6 +- cuda/test/components/merging.cu | 2 +- cuda/test/components/searching.cu | 2 +- dpcpp/CMakeLists.txt | 4 +- dpcpp/test/base/CMakeLists.txt | 2 +- hip/CMakeLists.txt | 4 +- hip/base/batch_multi_vector_kernels.hip.cpp | 10 +- hip/base/batch_struct.hip.hpp | 4 +- hip/base/config.hip.hpp | 4 +- hip/base/device.hip.cpp | 4 +- hip/base/device_matrix_data_kernels.hip.cpp | 2 +- hip/base/exception.hip.cpp | 2 +- hip/base/executor.hip.cpp | 6 +- hip/base/hipblas_bindings.hip.hpp | 18 +- hip/base/hiprand_bindings.hip.hpp | 16 +- hip/base/hipsparse_bindings.hip.hpp | 18 +- hip/base/hipsparse_block_bindings.hip.hpp | 4 +- hip/base/kernel_launch.hip.hpp | 14 +- hip/base/kernel_launch_reduction.hip.hpp | 4 +- hip/base/kernel_launch_solver.hip.hpp | 2 +- hip/base/memory.hip.cpp | 4 +- hip/base/pointer_mode_guard.hip.hpp | 2 +- hip/base/roctx.hip.cpp | 4 +- hip/base/scoped_device_id.hip.cpp | 4 +- hip/base/stream.hip.cpp | 4 +- hip/base/timer.hip.cpp | 4 +- hip/base/types.hip.hpp | 8 +- hip/components/atomic.hip.hpp | 34 +- hip/components/cooperative_groups.hip.hpp | 4 +- .../diagonal_block_manipulation.hip.hpp | 6 +- hip/components/format_conversion.hip.hpp | 6 +- hip/components/memory.hip.hpp | 2 +- hip/components/prefix_sum.hip.hpp | 4 +- hip/components/reduction.hip.hpp | 10 +- hip/components/searching.hip.hpp | 2 +- hip/components/segment_scan.hip.hpp | 2 +- hip/components/sorting.hip.hpp | 4 +- hip/components/syncfree.hip.hpp | 6 +- hip/components/thread_ids.hip.hpp | 7 +- hip/factorization/cholesky_kernels.hip.cpp | 14 +- .../factorization_kernels.hip.cpp | 9 +- hip/factorization/ic_kernels.hip.cpp | 26 +- hip/factorization/ilu_kernels.hip.cpp | 26 +- hip/factorization/lu_kernels.hip.cpp | 4 +- hip/factorization/par_ic_kernels.hip.cpp | 4 +- hip/factorization/par_ict_kernels.hip.cpp | 9 +- hip/factorization/par_ilu_kernels.hip.cpp | 8 +- ...=> par_ilut_approx_filter_kernels.hip.cpp} | 10 +- 
...ip.cpp => par_ilut_filter_kernels.hip.cpp} | 10 +- .../par_ilut_select_common.hip.cpp | 2 +- ...ip.cpp => par_ilut_select_kernels.hip.cpp} | 4 +- ...ip.cpp => par_ilut_spgeam_kernels.hip.cpp} | 6 +- ...hip.cpp => par_ilut_sweep_kernels.hip.cpp} | 7 +- hip/matrix/batch_csr_kernels.hip.cpp | 6 +- hip/matrix/batch_dense_kernels.hip.cpp | 8 +- hip/matrix/batch_ell_kernels.hip.cpp | 6 +- hip/matrix/batch_struct.hip.hpp | 2 +- hip/matrix/coo_kernels.hip.cpp | 19 +- hip/matrix/csr_kernels.template.hip.cpp | 153 +++---- hip/matrix/dense_kernels.hip.cpp | 103 +++-- hip/matrix/diagonal_kernels.hip.cpp | 10 +- hip/matrix/ell_kernels.hip.cpp | 35 +- hip/matrix/fbcsr_kernels.template.hip.cpp | 119 ++++-- hip/matrix/fft_kernels.hip.cpp | 2 +- hip/matrix/sellp_kernels.hip.cpp | 10 +- hip/matrix/sparsity_csr_kernels.hip.cpp | 34 +- hip/multigrid/pgm_kernels.hip.cpp | 2 +- .../batch_preconditioners.hip.hpp | 2 +- hip/preconditioner/isai_kernels.hip.cpp | 11 +- ...obi_advanced_apply_instantiate.inc.hip.cpp | 12 +- hip/preconditioner/jacobi_common.hip.hpp.in | 2 +- .../jacobi_generate_instantiate.inc.hip.cpp | 8 +- .../jacobi_generate_kernel.hip.cpp | 12 +- hip/preconditioner/jacobi_kernels.hip.cpp | 14 +- ...acobi_simple_apply_instantiate.inc.hip.cpp | 8 +- .../jacobi_simple_apply_kernel.hip.cpp | 12 +- hip/reorder/rcm_kernels.hip.cpp | 2 +- hip/solver/batch_bicgstab_kernels.hip.cpp | 8 +- hip/solver/batch_cg_kernels.hip.cpp | 8 +- hip/solver/cb_gmres_kernels.hip.cpp | 36 +- hip/solver/common_trs_kernels.hip.hpp | 30 +- hip/solver/idr_kernels.hip.cpp | 30 +- hip/solver/lower_trs_kernels.hip.cpp | 6 +- hip/solver/multigrid_kernels.hip.cpp | 6 +- hip/solver/upper_trs_kernels.hip.cpp | 6 +- hip/stop/criterion_kernels.hip.cpp | 2 +- hip/stop/residual_norm_kernels.hip.cpp | 6 +- hip/test/base/math.hip.cpp | 2 +- .../components/cooperative_groups.hip.cpp | 6 +- hip/test/components/merging.hip.cpp | 2 +- hip/test/components/searching.hip.cpp | 2 +- include/ginkgo/core/base/executor.hpp 
| 26 ++ omp/CMakeLists.txt | 4 +- 189 files changed, 1592 insertions(+), 1251 deletions(-) create mode 100644 accessor/cuda_hip_helper.hpp create mode 100644 common/cuda_hip/base/blas_bindings.hpp create mode 100644 common/cuda_hip/base/config.hpp create mode 100644 common/cuda_hip/base/pointer_mode_guard.hpp create mode 100644 common/cuda_hip/base/randlib_bindings.hpp create mode 100644 common/cuda_hip/base/runtime.hpp create mode 100644 common/cuda_hip/base/sparselib_bindings.hpp create mode 100644 common/cuda_hip/base/thrust.hpp create mode 100644 common/cuda_hip/base/types.hpp create mode 100644 common/cuda_hip/components/cooperative_groups.hpp create mode 100644 common/cuda_hip/components/format_conversion.hpp create mode 100644 common/cuda_hip/components/memory.hpp rename common/cuda_hip/factorization/{par_ict_spgeam_kernels.hpp.inc => par_ict_kernels.hpp.inc} (75%) delete mode 100644 common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc rename common/cuda_hip/preconditioner/{jacobi_advanced_apply_kernel.hpp.inc => jacobi_advanced_apply_kernels.hpp.inc} (100%) rename common/cuda_hip/preconditioner/{jacobi_generate_kernel.hpp.inc => jacobi_generate_kernels.hpp.inc} (100%) rename common/cuda_hip/preconditioner/{jacobi_simple_apply_kernel.hpp.inc => jacobi_simple_apply_kernels.hpp.inc} (100%) rename cuda/factorization/{par_ilut_approx_filter_kernel.cu => par_ilut_approx_filter_kernels.cu} (97%) rename cuda/factorization/{par_ilut_filter_kernel.cu => par_ilut_filter_kernels.cu} (96%) rename cuda/factorization/{par_ilut_select_kernel.cu => par_ilut_select_kernels.cu} (98%) rename cuda/factorization/{par_ilut_spgeam_kernel.cu => par_ilut_spgeam_kernels.cu} (97%) rename cuda/factorization/{par_ilut_sweep_kernel.cu => par_ilut_sweep_kernels.cu} (97%) rename cuda/preconditioner/{jacobi_advanced_apply_kernel.cu => jacobi_advanced_apply_kernels.cu} (100%) rename cuda/preconditioner/{jacobi_advanced_apply_instantiate.inc.cu => 
jacobi_advanced_apply_kernels.instantiate.cu} (94%) rename cuda/preconditioner/{jacobi_generate_kernel.cu => jacobi_generate_kernels.cu} (100%) rename cuda/preconditioner/{jacobi_generate_instantiate.inc.cu => jacobi_generate_kernels.instantiate.cu} (94%) rename cuda/preconditioner/{jacobi_simple_apply_kernel.cu => jacobi_simple_apply_kernels.cu} (100%) rename cuda/preconditioner/{jacobi_simple_apply_instantiate.inc.cu => jacobi_simple_apply_kernels.instantiate.cu} (93%) rename hip/factorization/{par_ilut_approx_filter_kernel.hip.cpp => par_ilut_approx_filter_kernels.hip.cpp} (97%) rename hip/factorization/{par_ilut_filter_kernel.hip.cpp => par_ilut_filter_kernels.hip.cpp} (96%) rename hip/factorization/{par_ilut_select_kernel.hip.cpp => par_ilut_select_kernels.hip.cpp} (99%) rename hip/factorization/{par_ilut_spgeam_kernel.hip.cpp => par_ilut_spgeam_kernels.hip.cpp} (98%) rename hip/factorization/{par_ilut_sweep_kernel.hip.cpp => par_ilut_sweep_kernels.hip.cpp} (97%) diff --git a/accessor/cuda_hip_helper.hpp b/accessor/cuda_hip_helper.hpp new file mode 100644 index 00000000000..225fdfe1b15 --- /dev/null +++ b/accessor/cuda_hip_helper.hpp @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_ACCESSOR_CUDA_HIP_HELPER_HPP_ +#define GKO_ACCESSOR_CUDA_HIP_HELPER_HPP_ + + +#include + + +#ifdef GKO_COMPILING_HIP +#include "accessor/hip_helper.hpp" +#else // GKO_COMPILING_CUDA +#include "accessor/cuda_helper.hpp" +#endif + + +namespace gko { +namespace acc { + + +template +GKO_ACC_INLINE auto as_device_range(AccType&& acc) +{ +#ifdef GKO_COMPILING_HIP + return as_hip_range(std::forward(acc)); +#else // GKO_COMPILING_CUDA + return as_cuda_range(std::forward(acc)); +#endif +} + + +} // namespace acc +} // namespace gko + + +#endif // GKO_ACCESSOR_CUDA_HIP_HELPER_HPP_ diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 0aa93a3b141..9f7079f60a3 100644 --- 
a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -160,7 +160,7 @@ endfunction(ginkgo_create_cuda_test) ## Internal function allowing separate test name, filename and target name function(ginkgo_create_cuda_test_internal test_name filename test_target_name) add_executable(${test_target_name} ${filename}) - target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA) + target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda) if(MSVC) target_compile_options(${test_target_name} PRIVATE @@ -188,7 +188,7 @@ endfunction(ginkgo_create_hip_test) function(ginkgo_create_hip_test_internal test_name filename test_target_name) set_source_files_properties(${filename} PROPERTIES LANGUAGE HIP) add_executable(${test_target_name} ${filename}) - target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP) + target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip) ginkgo_set_test_target_properties(${test_target_name} "_hip" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE hipgpu) endfunction(ginkgo_create_hip_test_internal) @@ -203,7 +203,7 @@ endfunction() function(ginkgo_create_omp_test_internal test_name filename test_target_name) ginkgo_build_test_name(${test_name} test_target_name) add_executable(${test_target_name} ${test_name}.cpp) - target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP) + target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP GKO_DEVICE_NAMESPACE=omp) target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX) ginkgo_set_test_target_properties(${test_target_name} "_omp" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cpu) @@ -253,7 +253,7 @@ function(ginkgo_create_common_test_internal test_name exec_type exec) target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX) endif () - 
target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec} GKO_COMPILING_${exec_upper}) + target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} GKO_DEVICE_NAMESPACE=${exec} GKO_COMPILING_${exec_upper}) target_link_libraries(${test_target_name} PRIVATE ${common_test_ADDITIONAL_LIBRARIES}) # use float for DPC++ if necessary if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE) @@ -285,13 +285,13 @@ function(ginkgo_create_common_device_test test_name) # need to make a separate file for this, since we can't set conflicting properties on the same file configure_file(${test_name}.cpp ${test_name}.cu COPYONLY) ginkgo_create_cuda_test_internal(${test_name}_cuda ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.cu ${test_target_name}_cuda ${ARGN}) - target_compile_definitions(${test_target_name}_cuda PRIVATE EXEC_TYPE=CudaExecutor EXEC_NAMESPACE=cuda) + target_compile_definitions(${test_target_name}_cuda PRIVATE EXEC_TYPE=CudaExecutor GKO_DEVICE_NAMESPACE=cuda) endif() if(GINKGO_BUILD_HIP) # need to make a separate file for this, since we can't set conflicting properties on the same file configure_file(${test_name}.cpp ${test_name}.hip.cpp COPYONLY) ginkgo_create_hip_test_internal(${test_name}_hip ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.hip.cpp ${test_target_name}_hip ${ARGN}) - target_compile_definitions(${test_target_name}_hip PRIVATE EXEC_TYPE=HipExecutor EXEC_NAMESPACE=hip) + target_compile_definitions(${test_target_name}_hip PRIVATE EXEC_TYPE=HipExecutor GKO_DEVICE_NAMESPACE=hip) endif() endfunction(ginkgo_create_common_device_test) diff --git a/common/cuda_hip/base/blas_bindings.hpp b/common/cuda_hip/base/blas_bindings.hpp new file mode 100644 index 00000000000..1708fb88ce1 --- /dev/null +++ b/common/cuda_hip/base/blas_bindings.hpp @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_ 
+#define GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_ + + +#ifdef GKO_COMPILING_HIP +#include "hip/base/hipblas_bindings.hip.hpp" +#else // GKO_COMPILING_CUDA +#include "cuda/base/cublas_bindings.hpp" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_ diff --git a/common/cuda_hip/base/config.hpp b/common/cuda_hip/base/config.hpp new file mode 100644 index 00000000000..d2085ae946b --- /dev/null +++ b/common/cuda_hip/base/config.hpp @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_ + + +#ifdef GKO_COMPILING_HIP +#include "hip/base/config.hip.hpp" +#else // GKO_COMPILING_CUDA +#include "cuda/base/config.hpp" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_ diff --git a/common/cuda_hip/base/pointer_mode_guard.hpp b/common/cuda_hip/base/pointer_mode_guard.hpp new file mode 100644 index 00000000000..41ff6242e49 --- /dev/null +++ b/common/cuda_hip/base/pointer_mode_guard.hpp @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_ + + +#ifdef GKO_COMPILING_HIP +#include "hip/base/pointer_mode_guard.hip.hpp" +#else // GKO_COMPILING_CUDA +#include "cuda/base/pointer_mode_guard.hpp" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_ diff --git a/common/cuda_hip/base/randlib_bindings.hpp b/common/cuda_hip/base/randlib_bindings.hpp new file mode 100644 index 00000000000..249489b0e68 --- /dev/null +++ b/common/cuda_hip/base/randlib_bindings.hpp @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_ + + +#ifdef 
GKO_COMPILING_HIP +#include "hip/base/hiprand_bindings.hip.hpp" +#else // GKO_COMPILING_CUDA +#include "cuda/base/curand_bindings.hpp" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_ diff --git a/common/cuda_hip/base/runtime.hpp b/common/cuda_hip/base/runtime.hpp new file mode 100644 index 00000000000..ccddfdd2661 --- /dev/null +++ b/common/cuda_hip/base/runtime.hpp @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_ + + +#ifdef GKO_COMPILING_HIP +#include +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_ diff --git a/common/cuda_hip/base/sparselib_bindings.hpp b/common/cuda_hip/base/sparselib_bindings.hpp new file mode 100644 index 00000000000..bc565f9190a --- /dev/null +++ b/common/cuda_hip/base/sparselib_bindings.hpp @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_ + + +#ifdef GKO_COMPILING_HIP +#include "hip/base/hipsparse_bindings.hip.hpp" +#else // GKO_COMPILING_CUDA +#include "cuda/base/cusparse_bindings.hpp" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_ diff --git a/common/cuda_hip/base/thrust.hpp b/common/cuda_hip/base/thrust.hpp new file mode 100644 index 00000000000..f2015d6d544 --- /dev/null +++ b/common/cuda_hip/base/thrust.hpp @@ -0,0 +1,51 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_THRUST_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_THRUST_HPP_ + + +#include + + +#include +#include + + +#if defined(GKO_COMPILING_CUDA) || \ + (defined(GKO_COMPILING_HIP) && !GINKGO_HIP_PLATFORM_HCC) +#include +#else +#include +#endif + + +namespace gko { +namespace 
kernels { +namespace GKO_DEVICE_NAMESPACE { + + +#ifdef GKO_COMPILING_CUDA +inline auto thrust_policy(std::shared_ptr exec) +{ + return thrust::cuda::par.on(exec->get_stream()); +} +#else +inline auto thrust_policy(std::shared_ptr exec) +{ +#if GINKGO_HIP_PLATFORM_HCC + return thrust::hip::par.on(exec->get_stream()); +#else + return thrust::cuda::par.on(exec->get_stream()); +#endif +} +#endif + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_BASE_THRUST_HPP_ diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp new file mode 100644 index 00000000000..213664d3a4d --- /dev/null +++ b/common/cuda_hip/base/types.hpp @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifdef GKO_COMPILING_CUDA +#include "cuda/base/types.hpp" +#else +#include "hip/base/types.hip.hpp" +#endif diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc index 3d76cfdcb79..60eaf5a9dd9 100644 --- a/common/cuda_hip/components/atomic.hpp.inc +++ b/common/cuda_hip/components/atomic.hpp.inc @@ -196,3 +196,35 @@ GKO_BIND_ATOMIC_MAX(unsigned long long int); #undef GKO_BIND_ATOMIC_MAX + + +/** + * @internal + * + * @note It is not 'real' complex atomic add operation + */ +__forceinline__ __device__ thrust::complex atomic_add( + thrust::complex* __restrict__ address, thrust::complex val) +{ + auto addr = reinterpret_cast(address); + // Separate to real part and imag part + auto real = atomic_add(addr, val.real()); + auto imag = atomic_add(addr + 1, val.imag()); + return {real, imag}; +} + + +/** + * @internal + * + * @note It is not 'real' complex atomic add operation + */ +__forceinline__ __device__ thrust::complex atomic_add( + thrust::complex* __restrict__ address, thrust::complex val) +{ + auto addr = reinterpret_cast(address); + // Separate to real part and imag part + auto real = 
atomic_add(addr, val.real()); + auto imag = atomic_add(addr + 1, val.imag()); + return {real, imag}; +} diff --git a/common/cuda_hip/components/cooperative_groups.hpp b/common/cuda_hip/components/cooperative_groups.hpp new file mode 100644 index 00000000000..b1f17842302 --- /dev/null +++ b/common/cuda_hip/components/cooperative_groups.hpp @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_ + + +#ifdef GKO_COMPILING_HIP +#include "hip/components/cooperative_groups.hip.hpp" +#else // GKO_COMPILING_CUDA +#include "cuda/components/cooperative_groups.cuh" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_ diff --git a/common/cuda_hip/components/format_conversion.hpp b/common/cuda_hip/components/format_conversion.hpp new file mode 100644 index 00000000000..a16d09b2e3a --- /dev/null +++ b/common/cuda_hip/components/format_conversion.hpp @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_ + + +#ifdef GKO_COMPILING_HIP +#include "hip/components/format_conversion.hip.hpp" +#else // GKO_COMPILING_CUDA +#include "cuda/components/format_conversion.cuh" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_ diff --git a/common/cuda_hip/components/memory.hpp b/common/cuda_hip/components/memory.hpp new file mode 100644 index 00000000000..974431e2fb8 --- /dev/null +++ b/common/cuda_hip/components/memory.hpp @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_ + + +#ifdef 
GKO_COMPILING_HIP +#include "hip/components/memory.hip.hpp" +#else // GKO_COMPILING_CUDA +#include "cuda/components/memory.cuh" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_ diff --git a/common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_kernels.hpp.inc similarity index 75% rename from common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ict_kernels.hpp.inc index 93a49e56d21..87aa8297345 100644 --- a/common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ict_kernels.hpp.inc @@ -206,4 +206,72 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init( } +template +__global__ __launch_bounds__(default_block_size) void ict_sweep( + const IndexType* __restrict__ a_row_ptrs, + const IndexType* __restrict__ a_col_idxs, + const ValueType* __restrict__ a_vals, + const IndexType* __restrict__ l_row_ptrs, + const IndexType* __restrict__ l_row_idxs, + const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals, + IndexType l_nnz) +{ + auto l_nz = thread::get_subwarp_id_flat(); + if (l_nz >= l_nnz) { + return; + } + auto row = l_row_idxs[l_nz]; + auto col = l_col_idxs[l_nz]; + auto subwarp = + group::tiled_partition(group::this_thread_block()); + // find entry of A at (row, col) + auto a_row_begin = a_row_ptrs[row]; + auto a_row_end = a_row_ptrs[row + 1]; + auto a_row_size = a_row_end - a_row_begin; + auto a_idx = + group_wide_search(a_row_begin, a_row_size, subwarp, + [&](IndexType i) { return a_col_idxs[i] >= col; }); + bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col; + auto a_val = has_a ? 
a_vals[a_idx] : zero(); + auto l_row_begin = l_row_ptrs[row]; + auto l_row_size = l_row_ptrs[row + 1] - l_row_begin; + auto lh_col_begin = l_row_ptrs[col]; + auto lh_col_size = l_row_ptrs[col + 1] - lh_col_begin; + ValueType sum{}; + IndexType lh_nz{}; + auto last_entry = col; + group_merge( + l_col_idxs + l_row_begin, l_row_size, l_col_idxs + lh_col_begin, + lh_col_size, subwarp, + [&](IndexType l_idx, IndexType l_col, IndexType lh_idx, + IndexType lh_row, IndexType, bool) { + // we don't need to use the `bool valid` because last_entry is + // already a smaller sentinel value than the one used in group_merge + if (l_col == lh_row && l_col < last_entry) { + sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * + conj(load_relaxed(l_vals + (lh_idx + lh_col_begin))); + } + // remember the transposed element + auto found_transp = subwarp.ballot(lh_row == row); + if (found_transp) { + lh_nz = + subwarp.shfl(lh_idx + lh_col_begin, ffs(found_transp) - 1); + } + return true; + }); + // accumulate result from all threads + sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; }); + + if (subwarp.thread_rank() == 0) { + auto to_write = + row == col ? 
sqrt(a_val - sum) + : (a_val - sum) / + load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1)); + if (is_finite(to_write)) { + store_relaxed(l_vals + l_nz, to_write); + } + } +} + + } // namespace kernel diff --git a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc deleted file mode 100644 index bc58f0a9799..00000000000 --- a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc +++ /dev/null @@ -1,76 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void ict_sweep( - const IndexType* __restrict__ a_row_ptrs, - const IndexType* __restrict__ a_col_idxs, - const ValueType* __restrict__ a_vals, - const IndexType* __restrict__ l_row_ptrs, - const IndexType* __restrict__ l_row_idxs, - const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals, - IndexType l_nnz) -{ - auto l_nz = thread::get_subwarp_id_flat(); - if (l_nz >= l_nnz) { - return; - } - auto row = l_row_idxs[l_nz]; - auto col = l_col_idxs[l_nz]; - auto subwarp = - group::tiled_partition(group::this_thread_block()); - // find entry of A at (row, col) - auto a_row_begin = a_row_ptrs[row]; - auto a_row_end = a_row_ptrs[row + 1]; - auto a_row_size = a_row_end - a_row_begin; - auto a_idx = - group_wide_search(a_row_begin, a_row_size, subwarp, - [&](IndexType i) { return a_col_idxs[i] >= col; }); - bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col; - auto a_val = has_a ? 
a_vals[a_idx] : zero(); - auto l_row_begin = l_row_ptrs[row]; - auto l_row_size = l_row_ptrs[row + 1] - l_row_begin; - auto lh_col_begin = l_row_ptrs[col]; - auto lh_col_size = l_row_ptrs[col + 1] - lh_col_begin; - ValueType sum{}; - IndexType lh_nz{}; - auto last_entry = col; - group_merge( - l_col_idxs + l_row_begin, l_row_size, l_col_idxs + lh_col_begin, - lh_col_size, subwarp, - [&](IndexType l_idx, IndexType l_col, IndexType lh_idx, - IndexType lh_row, IndexType, bool) { - // we don't need to use the `bool valid` because last_entry is - // already a smaller sentinel value than the one used in group_merge - if (l_col == lh_row && l_col < last_entry) { - sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * - conj(load_relaxed(l_vals + (lh_idx + lh_col_begin))); - } - // remember the transposed element - auto found_transp = subwarp.ballot(lh_row == row); - if (found_transp) { - lh_nz = - subwarp.shfl(lh_idx + lh_col_begin, ffs(found_transp) - 1); - } - return true; - }); - // accumulate result from all threads - sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; }); - - if (subwarp.thread_rank() == 0) { - auto to_write = - row == col ? 
sqrt(a_val - sum) - : (a_val - sum) / - load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1)); - if (is_finite(to_write)) { - store_relaxed(l_vals + l_nz, to_write); - } - } -} - - -} // namespace kernel diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc similarity index 100% rename from common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc rename to common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc similarity index 100% rename from common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc rename to common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc similarity index 100% rename from common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc rename to common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp index b32572546f0..5ca25ecb1e3 100644 --- a/common/unified/base/kernel_launch.hpp +++ b/common/unified/base/kernel_launch.hpp @@ -19,7 +19,7 @@ #define GKO_DEVICE_NAMESPACE cuda #define GKO_KERNEL __device__ -#include "cuda/base/types.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { @@ -46,7 +46,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type unpack_member(T value) #define GKO_DEVICE_NAMESPACE hip #define GKO_KERNEL __device__ -#include "hip/base/types.hip.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { diff --git a/core/test/gtest/CMakeLists.txt b/core/test/gtest/CMakeLists.txt index 56f83181375..f500ddb6ae5 100644 --- a/core/test/gtest/CMakeLists.txt +++ b/core/test/gtest/CMakeLists.txt 
@@ -25,14 +25,14 @@ if (GINKGO_BUILD_MPI) add_library(ginkgo_gtest_main_mpi_cpu ALIAS ginkgo_gtest_main_mpi) endif() if (GINKGO_BUILD_OMP) - add_gtest_main("_omp" "GKO_COMPILING_OMP") + add_gtest_main("_omp" "GKO_COMPILING_OMP;GKO_DEVICE_NAMESPACE=omp") endif() if (GINKGO_BUILD_CUDA) - add_gtest_main("_cuda" "GKO_COMPILING_CUDA") + add_gtest_main("_cuda" "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda") endif() if (GINKGO_BUILD_HIP) - add_gtest_main("_hip" "GKO_COMPILING_HIP") + add_gtest_main("_hip" "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip") endif() if (GINKGO_BUILD_SYCL) - add_gtest_main("_dpcpp" "GKO_COMPILING_DPCPP") + add_gtest_main("_dpcpp" "GKO_COMPILING_DPCPP;GKO_DEVICE_NAMESPACE=dpcpp") endif() diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index bd214691a2e..88ae83e9005 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -120,7 +120,7 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") endif() ginkgo_compile_features(ginkgo_cuda) -target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA) +target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda) # include path for generated headers like jacobi_common.hpp target_include_directories(ginkgo_cuda @@ -133,7 +133,7 @@ ginkgo_default_includes(ginkgo_cuda) ginkgo_install_library(ginkgo_cuda) if (GINKGO_CHECK_CIRCULAR_DEPS) - ginkgo_check_headers(ginkgo_cuda GKO_COMPILING_CUDA) + ginkgo_check_headers(ginkgo_cuda "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda") endif() if(GINKGO_BUILD_TESTS) diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index 5bc899c11ed..dcaafd5a46c 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -13,13 +13,14 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include 
"common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -39,6 +40,7 @@ namespace batch_multi_vector { constexpr auto default_block_size = 256; constexpr int sm_oversubscription = 4; + // clang-format off // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp index 7c968ec2c6e..5251c594d42 100644 --- a/cuda/base/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -10,9 +10,9 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/base/batch_struct.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" namespace gko { diff --git a/cuda/base/cublas_bindings.hpp b/cuda/base/cublas_bindings.hpp index 485249b7665..c1cdf1f996e 100644 --- a/cuda/base/cublas_bindings.hpp +++ b/cuda/base/cublas_bindings.hpp @@ -12,8 +12,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" namespace gko { @@ -249,6 +249,20 @@ inline void destroy(cublasHandle_t handle) } // namespace cublas + + +namespace blas { + + +using namespace cublas; + + +#define BLAS_OP_N CUBLAS_OP_N +#define BLAS_OP_T CUBLAS_OP_T +#define BLAS_OP_C CUBLAS_OP_C + + +} // namespace blas } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp index b0ae52c5f00..10e09f4a356 100644 --- a/cuda/base/curand_bindings.hpp +++ b/cuda/base/curand_bindings.hpp @@ -12,8 +12,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include 
"cuda/base/math.hpp" -#include "cuda/base/types.hpp" namespace gko { @@ -85,6 +85,18 @@ GKO_BIND_CURAND_RANDOM_VECTOR(std::complex, curandGenerateNormalDouble); } // namespace curand + + +namespace randlib { + + +using namespace curand; + + +#define RANDLIB_RNG_PSEUDO_DEFAULT CURAND_RNG_PSEUDO_DEFAULT + + +} // namespace randlib } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp index 87737e8865e..06aaf0c6f1d 100644 --- a/cuda/base/cusparse_bindings.hpp +++ b/cuda/base/cusparse_bindings.hpp @@ -13,7 +13,7 @@ #include -#include "cuda/base/types.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { @@ -948,7 +948,7 @@ inline csrilu02Info_t create_ilu0_info() } -inline void destroy(csrilu02Info_t info) +inline void destroy_ilu0_info(csrilu02Info_t info) { GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsrilu02Info(info)); } @@ -962,7 +962,7 @@ inline csric02Info_t create_ic0_info() } -inline void destroy(csric02Info_t info) +inline void destroy_ic0_info(csric02Info_t info) { GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsric02Info(info)); } @@ -1463,6 +1463,20 @@ GKO_BIND_CUSPARSE_IC0(std::complex, cusparseZcsric02); } // namespace cusparse + + +namespace sparselib { + + +using namespace cusparse; + + +#define SPARSELIB_OPERATION_TRANSPOSE CUSPARSE_OPERATION_TRANSPOSE +#define SPARSELIB_OPERATION_NON_TRANSPOSE CUSPARSE_OPERATION_NON_TRANSPOSE +#define SPARSELIB_SOLVE_POLICY_USE_LEVEL CUSPARSE_SOLVE_POLICY_USE_LEVEL + + +} // namespace sparselib } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/base/cusparse_block_bindings.hpp b/cuda/base/cusparse_block_bindings.hpp index eddf249a22b..fc64c19796c 100644 --- a/cuda/base/cusparse_block_bindings.hpp +++ b/cuda/base/cusparse_block_bindings.hpp @@ -13,8 +13,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/types.hpp" namespace 
gko { diff --git a/cuda/base/device_matrix_data_kernels.cu b/cuda/base/device_matrix_data_kernels.cu index ed5601f57a5..554abe8bc37 100644 --- a/cuda/base/device_matrix_data_kernels.cu +++ b/cuda/base/device_matrix_data_kernels.cu @@ -14,8 +14,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" namespace gko { diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index 52a92132689..3d1dbf7c92c 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -20,7 +20,7 @@ #include -#include "cuda/base/config.hpp" +#include "common/cuda_hip/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/cusparse_handle.hpp" #include "cuda/base/scoped_device_id.hpp" diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh index ec8d31ba747..0d4bc4eebd5 100644 --- a/cuda/base/kernel_launch.cuh +++ b/cuda/base/kernel_launch.cuh @@ -11,8 +11,9 @@ #include -#include "accessor/cuda_helper.hpp" -#include "cuda/base/types.hpp" +#include "accessor/cuda_hip_helper.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" #include "cuda/components/thread_ids.cuh" @@ -23,21 +24,21 @@ namespace cuda { template struct to_device_type_impl&> { - using type = std::decay_t>()))>; static type map_to_device(gko::acc::range& range) { - return gko::acc::as_cuda_range(range); + return gko::acc::as_device_range(range); } }; template struct to_device_type_impl&> { - using type = std::decay_t>()))>; static type map_to_device(const gko::acc::range& range) { - return gko::acc::as_cuda_range(range); + return gko::acc::as_device_range(range); } }; diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index 6146d7248d0..817d19006bc 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -8,9 +8,9 @@ #endif +#include "common/cuda_hip/base/types.hpp" +#include 
"common/cuda_hip/components/cooperative_groups.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh index 17988755517..0d9eaeb2653 100644 --- a/cuda/base/kernel_launch_solver.cuh +++ b/cuda/base/kernel_launch_solver.cuh @@ -8,6 +8,9 @@ #endif +#include "common/cuda_hip/base/runtime.hpp" + + namespace gko { namespace kernels { namespace cuda { diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 88e9eb17a35..510d7cef889 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -394,6 +394,10 @@ GKO_INLINE GKO_ATTRIBUTES constexpr } +using gpuComplex = cuComplex; +using gpuDoubleComplex = cuDoubleComplex; + + } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/components/atomic.cuh b/cuda/components/atomic.cuh index 6dbed0b0d25..1964f0ae196 100644 --- a/cuda/components/atomic.cuh +++ b/cuda/components/atomic.cuh @@ -9,8 +9,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" namespace gko { @@ -21,38 +21,6 @@ namespace cuda { #include "common/cuda_hip/components/atomic.hpp.inc" -/** - * @internal - * - * @note It is not 'real' complex atomic add operation - */ -__forceinline__ __device__ thrust::complex atomic_add( - thrust::complex* __restrict__ address, thrust::complex val) -{ - cuComplex* addr = reinterpret_cast(address); - // Separate to real part and imag part - auto real = atomic_add(&(addr->x), val.real()); - auto imag = atomic_add(&(addr->y), val.imag()); - return {real, imag}; -} - - -/** - * @internal - * - * @note It is not 'real' complex atomic add operation - */ -__forceinline__ __device__ thrust::complex atomic_add( - thrust::complex* __restrict__ address, thrust::complex val) -{ - 
cuDoubleComplex* addr = reinterpret_cast(address); - // Separate to real part and imag part - auto real = atomic_add(&(addr->x), val.real()); - auto imag = atomic_add(&(addr->y), val.imag()); - return {real, imag}; -} - - } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh index eae0c957f21..70643a3b16a 100644 --- a/cuda/components/cooperative_groups.cuh +++ b/cuda/components/cooperative_groups.cuh @@ -13,7 +13,7 @@ #include -#include "cuda/base/config.hpp" +#include "common/cuda_hip/base/config.hpp" namespace gko { diff --git a/cuda/components/diagonal_block_manipulation.cuh b/cuda/components/diagonal_block_manipulation.cuh index d748fcab2e5..a8f27d3a81f 100644 --- a/cuda/components/diagonal_block_manipulation.cuh +++ b/cuda/components/diagonal_block_manipulation.cuh @@ -9,9 +9,9 @@ #include -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" namespace gko { diff --git a/cuda/components/format_conversion.cuh b/cuda/components/format_conversion.cuh index bccc927c9cd..f0ef007c53c 100644 --- a/cuda/components/format_conversion.cuh +++ b/cuda/components/format_conversion.cuh @@ -10,7 +10,7 @@ #include -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh index 22bedca9699..97e5d67c23a 100644 --- a/cuda/components/memory.cuh +++ b/cuda/components/memory.cuh @@ -12,7 +12,7 @@ #include -#include "cuda/base/types.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { diff --git a/cuda/components/prefix_sum.cuh b/cuda/components/prefix_sum.cuh index 653de4e9e15..2f6f145e304 100644 --- 
a/cuda/components/prefix_sum.cuh +++ b/cuda/components/prefix_sum.cuh @@ -9,8 +9,8 @@ #include -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh index ded80fae40a..250c560d44b 100644 --- a/cuda/components/reduction.cuh +++ b/cuda/components/reduction.cuh @@ -13,10 +13,11 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh index 1dc1304a82a..5472ac46ed1 100644 --- a/cuda/components/searching.cuh +++ b/cuda/components/searching.cuh @@ -6,7 +6,7 @@ #define GKO_CUDA_COMPONENTS_SEARCHING_CUH_ -#include "cuda/base/config.hpp" +#include "common/cuda_hip/base/config.hpp" #include "cuda/components/intrinsics.cuh" diff --git a/cuda/components/segment_scan.cuh b/cuda/components/segment_scan.cuh index 842f1e06760..6ffb8028334 100644 --- a/cuda/components/segment_scan.cuh +++ b/cuda/components/segment_scan.cuh @@ -6,7 +6,7 @@ #define GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_ -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/components/cooperative_groups.hpp" namespace gko { diff --git a/cuda/components/sorting.cuh b/cuda/components/sorting.cuh index e6eb17ec8e4..59e44d1bb82 100644 --- a/cuda/components/sorting.cuh +++ b/cuda/components/sorting.cuh @@ -6,8 +6,8 @@ #define 
GKO_CUDA_COMPONENTS_SORTING_CUH_ -#include "cuda/base/config.hpp" -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" namespace gko { diff --git a/cuda/components/syncfree.cuh b/cuda/components/syncfree.cuh index 0d45c8db516..0d5c0d11f43 100644 --- a/cuda/components/syncfree.cuh +++ b/cuda/components/syncfree.cuh @@ -9,11 +9,11 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "core/components/fill_array_kernels.hpp" -#include "cuda/base/config.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/memory.cuh" namespace gko { diff --git a/cuda/components/thread_ids.cuh b/cuda/components/thread_ids.cuh index c3e517e0f9d..1113ea75fc6 100644 --- a/cuda/components/thread_ids.cuh +++ b/cuda/components/thread_ids.cuh @@ -6,17 +6,12 @@ #define GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_ -#include "cuda/base/config.hpp" +#include "common/cuda_hip/base/config.hpp" namespace gko { namespace kernels { namespace cuda { -/** - * @brief The CUDA thread namespace. 
- * - * @ingroup cuda_thread - */ namespace thread { diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu index ca9c419239b..7b06ada9f0e 100644 --- a/cuda/distributed/vector_kernels.cu +++ b/cuda/distributed/vector_kernels.cu @@ -5,6 +5,9 @@ #include "core/distributed/vector_kernels.hpp" +#include + + #include #include #include diff --git a/cuda/factorization/cholesky_kernels.cu b/cuda/factorization/cholesky_kernels.cu index 79779f2f54b..e05b0803dc2 100644 --- a/cuda/factorization/cholesky_kernels.cu +++ b/cuda/factorization/cholesky_kernels.cu @@ -20,15 +20,15 @@ #include +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" #include "core/factorization/elimination_forest.hpp" #include "core/factorization/lu_kernels.hpp" #include "core/matrix/csr_lookup.hpp" -#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/syncfree.cuh" @@ -80,19 +80,19 @@ void symbolic_count(std::shared_ptr exec, } // sort postorder_cols inside rows { - const auto handle = exec->get_cusparse_handle(); - auto descr = cusparse::create_mat_descr(); + const auto handle = exec->get_sparselib_handle(); + auto descr = sparselib::create_mat_descr(); array permutation_array(exec, mtx_nnz); auto permutation = permutation_array.get_data(); components::fill_seq_array(exec, permutation, mtx_nnz); size_type buffer_size{}; - cusparse::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, - row_ptrs, postorder_cols, buffer_size); + sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, + row_ptrs, postorder_cols, buffer_size); array buffer_array{exec, buffer_size}; auto buffer = 
buffer_array.get_data(); - cusparse::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, - postorder_cols, permutation, buffer); - cusparse::destroy(descr); + sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, + postorder_cols, permutation, buffer); + sparselib::destroy(descr); } // count nonzeros per row of L { diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu index 4ea03981a15..309ded37d34 100644 --- a/cuda/factorization/factorization_kernels.cu +++ b/cuda/factorization/factorization_kernels.cu @@ -8,12 +8,13 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/searching.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/factorization/ic_kernels.cu b/cuda/factorization/ic_kernels.cu index 1afb10ce57a..9d55856f139 100644 --- a/cuda/factorization/ic_kernels.cu +++ b/cuda/factorization/ic_kernels.cu @@ -8,7 +8,7 @@ #include -#include "cuda/base/cusparse_bindings.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" namespace gko { @@ -27,37 +27,37 @@ void compute(std::shared_ptr exec, matrix::Csr* m) { const auto id = exec->get_device_id(); - auto handle = exec->get_cusparse_handle(); - auto desc = cusparse::create_mat_descr(); - auto info = cusparse::create_ic0_info(); + auto handle = exec->get_sparselib_handle(); + auto desc = sparselib::create_mat_descr(); + auto info = sparselib::create_ic0_info(); // get buffer size for IC IndexType num_rows = m->get_size()[0]; IndexType nnz = 
m->get_num_stored_elements(); size_type buffer_size{}; - cusparse::ic0_buffer_size(handle, num_rows, nnz, desc, - m->get_const_values(), m->get_const_row_ptrs(), - m->get_const_col_idxs(), info, buffer_size); + sparselib::ic0_buffer_size(handle, num_rows, nnz, desc, + m->get_const_values(), m->get_const_row_ptrs(), + m->get_const_col_idxs(), info, buffer_size); array buffer{exec, buffer_size}; // set up IC(0) - cusparse::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), - m->get_const_row_ptrs(), m->get_const_col_idxs(), - info, CUSPARSE_SOLVE_POLICY_USE_LEVEL, - buffer.get_data()); + sparselib::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), + m->get_const_row_ptrs(), m->get_const_col_idxs(), + info, SPARSELIB_SOLVE_POLICY_USE_LEVEL, + buffer.get_data()); - cusparse::ic0(handle, num_rows, nnz, desc, m->get_values(), - m->get_const_row_ptrs(), m->get_const_col_idxs(), info, - CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); + sparselib::ic0(handle, num_rows, nnz, desc, m->get_values(), + m->get_const_row_ptrs(), m->get_const_col_idxs(), info, + SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); // CUDA 11.4 has a use-after-free bug on Turing #if (CUDA_VERSION >= 11040) exec->synchronize(); #endif - cusparse::destroy(info); - cusparse::destroy(desc); + sparselib::destroy_ic0_info(info); + sparselib::destroy(desc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL); diff --git a/cuda/factorization/ilu_kernels.cu b/cuda/factorization/ilu_kernels.cu index 33e59bb54c9..acebec6e94c 100644 --- a/cuda/factorization/ilu_kernels.cu +++ b/cuda/factorization/ilu_kernels.cu @@ -8,7 +8,7 @@ #include -#include "cuda/base/cusparse_bindings.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" namespace gko { @@ -27,37 +27,37 @@ void compute_lu(std::shared_ptr exec, matrix::Csr* m) { const auto id = exec->get_device_id(); - auto handle = exec->get_cusparse_handle(); - auto desc = cusparse::create_mat_descr(); 
- auto info = cusparse::create_ilu0_info(); + auto handle = exec->get_sparselib_handle(); + auto desc = sparselib::create_mat_descr(); + auto info = sparselib::create_ilu0_info(); // get buffer size for ILU IndexType num_rows = m->get_size()[0]; IndexType nnz = m->get_num_stored_elements(); size_type buffer_size{}; - cusparse::ilu0_buffer_size(handle, num_rows, nnz, desc, - m->get_const_values(), m->get_const_row_ptrs(), - m->get_const_col_idxs(), info, buffer_size); + sparselib::ilu0_buffer_size(handle, num_rows, nnz, desc, + m->get_const_values(), m->get_const_row_ptrs(), + m->get_const_col_idxs(), info, buffer_size); array buffer{exec, buffer_size}; // set up ILU(0) - cusparse::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), - m->get_const_row_ptrs(), m->get_const_col_idxs(), - info, CUSPARSE_SOLVE_POLICY_USE_LEVEL, - buffer.get_data()); + sparselib::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), + m->get_const_row_ptrs(), m->get_const_col_idxs(), + info, SPARSELIB_SOLVE_POLICY_USE_LEVEL, + buffer.get_data()); - cusparse::ilu0(handle, num_rows, nnz, desc, m->get_values(), - m->get_const_row_ptrs(), m->get_const_col_idxs(), info, - CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); + sparselib::ilu0(handle, num_rows, nnz, desc, m->get_values(), + m->get_const_row_ptrs(), m->get_const_col_idxs(), info, + SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); // CUDA 11.4 has a use-after-free bug on Turing #if (CUDA_VERSION >= 11040) exec->synchronize(); #endif - cusparse::destroy(info); - cusparse::destroy(desc); + sparselib::destroy_ilu0_info(info); + sparselib::destroy(desc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/factorization/lu_kernels.cu b/cuda/factorization/lu_kernels.cu index 583bf51fb67..9c3069f62cf 100644 --- a/cuda/factorization/lu_kernels.cu +++ b/cuda/factorization/lu_kernels.cu @@ -17,11 +17,11 @@ #include +#include "common/cuda_hip/base/types.hpp" +#include 
"common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/allocator.hpp" #include "core/matrix/csr_lookup.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/syncfree.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu index a9de634f1f9..f493cb11fd1 100644 --- a/cuda/factorization/par_ic_kernels.cu +++ b/cuda/factorization/par_ic_kernels.cu @@ -10,9 +10,9 @@ #include +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/memory.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu index 5f48ceef2f8..d958f81d2f4 100644 --- a/cuda/factorization/par_ict_kernels.cu +++ b/cuda/factorization/par_ict_kernels.cu @@ -12,6 +12,8 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" @@ -19,7 +21,6 @@ #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/math.hpp" #include "cuda/components/intrinsics.cuh" -#include "cuda/components/memory.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" @@ -46,8 +47,7 @@ using compiled_kernels = syn::value_list; -#include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc" -#include "common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc" +#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc" namespace { diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu index 
7a770a39353..cd48dd2a9db 100644 --- a/cuda/factorization/par_ilu_kernels.cu +++ b/cuda/factorization/par_ilu_kernels.cu @@ -5,12 +5,14 @@ #include "core/factorization/par_ilu_kernels.hpp" +#include #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/memory.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernels.cu similarity index 97% rename from cuda/factorization/par_ilut_approx_filter_kernel.cu rename to cuda/factorization/par_ilut_approx_filter_kernels.cu index 853519cd36b..ae544939e17 100644 --- a/cuda/factorization/par_ilut_approx_filter_kernel.cu +++ b/cuda/factorization/par_ilut_approx_filter_kernels.cu @@ -15,16 +15,16 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/prefix_sum.cuh" #include "cuda/components/sorting.cuh" diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernels.cu similarity index 96% rename from cuda/factorization/par_ilut_filter_kernel.cu rename to cuda/factorization/par_ilut_filter_kernels.cu index 0e63f102b72..4a24c5f305b 100644 --- a/cuda/factorization/par_ilut_filter_kernel.cu +++ 
b/cuda/factorization/par_ilut_filter_kernels.cu @@ -12,15 +12,16 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/factorization/par_ilut_select_kernel.cu b/cuda/factorization/par_ilut_select_kernels.cu similarity index 98% rename from cuda/factorization/par_ilut_select_kernel.cu rename to cuda/factorization/par_ilut_select_kernels.cu index ca8b55e504b..6a7bd53c1c4 100644 --- a/cuda/factorization/par_ilut_select_kernel.cu +++ b/cuda/factorization/par_ilut_select_kernels.cu @@ -13,6 +13,7 @@ #include +#include "common/cuda_hip/base/runtime.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "cuda/base/math.hpp" #include "cuda/components/atomic.cuh" @@ -147,7 +148,7 @@ void threshold_select(std::shared_ptr exec, auto out_ptr = reinterpret_cast(tmp1.get_data()); kernel::basecase_select<<<1, kernel::basecase_block_size, 0, exec->get_stream()>>>( - as_cuda_type(tmp22), bucket.size, rank, as_cuda_type(out_ptr)); + as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr)); threshold = exec->copy_val_to_host(out_ptr); } diff --git a/cuda/factorization/par_ilut_spgeam_kernel.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu similarity index 97% rename from cuda/factorization/par_ilut_spgeam_kernel.cu rename to cuda/factorization/par_ilut_spgeam_kernels.cu index c4372f66219..0a751c2f48f 100644 --- 
a/cuda/factorization/par_ilut_spgeam_kernel.cu +++ b/cuda/factorization/par_ilut_spgeam_kernels.cu @@ -12,13 +12,14 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/math.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/prefix_sum.cuh" @@ -80,8 +81,8 @@ void add_candidates(syn::value_list, auto u_vals = u->get_const_values(); auto l_new_row_ptrs = l_new->get_row_ptrs(); auto u_new_row_ptrs = u_new->get_row_ptrs(); - // count non-zeros per row if (num_blocks > 0) { + // count non-zeros per row kernel::tri_spgeam_nnz <<get_stream()>>>( lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, @@ -105,8 +106,8 @@ void add_candidates(syn::value_list, auto u_new_col_idxs = u_new->get_col_idxs(); auto u_new_vals = u_new->get_values(); - // fill columns and values if (num_blocks > 0) { + // fill columns and values kernel::tri_spgeam_init <<get_stream()>>>( lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs, diff --git a/cuda/factorization/par_ilut_sweep_kernel.cu b/cuda/factorization/par_ilut_sweep_kernels.cu similarity index 97% rename from cuda/factorization/par_ilut_sweep_kernel.cu rename to cuda/factorization/par_ilut_sweep_kernels.cu index 85fb3f26e21..5924ebe328d 100644 --- a/cuda/factorization/par_ilut_sweep_kernel.cu +++ b/cuda/factorization/par_ilut_sweep_kernels.cu @@ -12,6 +12,8 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" @@ -19,7 +21,6 @@ #include 
"core/synthesizer/implementation_selection.hpp" #include "cuda/base/math.hpp" #include "cuda/components/intrinsics.cuh" -#include "cuda/components/memory.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh index 26c60ae78eb..3e53d6ef0a6 100644 --- a/cuda/log/batch_logger.cuh +++ b/cuda/log/batch_logger.cuh @@ -23,4 +23,5 @@ namespace batch_log { } // namespace kernels } // namespace gko + #endif // GKO_CUDA_LOG_BATCH_LOGGER_CUH_ diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu index 6be0a2cab3b..6ec20480405 100644 --- a/cuda/matrix/batch_csr_kernels.cu +++ b/cuda/matrix/batch_csr_kernels.cu @@ -13,12 +13,13 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu index 56268d8d6b4..673b08e5db1 100644 --- a/cuda/matrix/batch_dense_kernels.cu +++ b/cuda/matrix/batch_dense_kernels.cu @@ -9,15 +9,17 @@ #include +#include #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" 
#include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu index 3c824cf8da4..8f0160bd154 100644 --- a/cuda/matrix/batch_ell_kernels.cu +++ b/cuda/matrix/batch_ell_kernels.cu @@ -13,12 +13,13 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp index 1c17aea3bfe..5e9c803c9f6 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/matrix/batch_struct.hpp @@ -13,8 +13,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "core/base/batch_struct.hpp" -#include "cuda/base/types.hpp" namespace gko { diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu index 3d67144c9ec..f138d0b934e 100644 --- a/cuda/matrix/coo_kernels.cu +++ b/cuda/matrix/coo_kernels.cu @@ -12,25 +12,21 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" #include "core/matrix/dense_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include 
"cuda/components/format_conversion.cuh" #include "cuda/components/segment_scan.cuh" #include "cuda/components/thread_ids.cuh" namespace gko { namespace kernels { -/** - * @brief The CUDA namespace. - * - * @ingroup cuda - */ namespace cuda { /** * @brief The Coordinate matrix format namespace. diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu index 4a779775670..73ce267ec65 100644 --- a/cuda/matrix/csr_kernels.template.cu +++ b/cuda/matrix/csr_kernels.template.cu @@ -27,7 +27,13 @@ #include -#include "accessor/cuda_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" #include "core/base/array_access.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" @@ -38,15 +44,9 @@ #include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/format_conversion.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/prefix_sum.cuh" @@ -133,10 +133,11 @@ void merge_path_spmv(syn::value_list, kernel::abstract_merge_path_spmv <<get_stream()>>>( static_cast(a->get_size()[0]), - acc::as_cuda_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), 
as_device_type(a->get_const_srow()), - acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals), + acc::as_device_range(b_vals), + acc::as_device_range(c_vals), as_device_type(row_out.get_data()), as_device_type(val_out.get_data())); } @@ -144,7 +145,7 @@ void merge_path_spmv(syn::value_list, abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>( grid_num, as_device_type(val_out.get_data()), as_device_type(row_out.get_data()), - acc::as_cuda_range(c_vals)); + acc::as_device_range(c_vals)); } else if (alpha != nullptr && beta != nullptr) { if (grid_num > 0) { @@ -152,12 +153,12 @@ void merge_path_spmv(syn::value_list, <<get_stream()>>>( static_cast(a->get_size()[0]), as_device_type(alpha->get_const_values()), - acc::as_cuda_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_cuda_range(b_vals), + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), - acc::as_cuda_range(c_vals), + acc::as_device_range(c_vals), as_device_type(row_out.get_data()), as_device_type(val_out.get_data())); } @@ -166,7 +167,7 @@ void merge_path_spmv(syn::value_list, grid_num, as_device_type(val_out.get_data()), as_device_type(row_out.get_data()), as_device_type(alpha->get_const_values()), - acc::as_cuda_range(c_vals)); + acc::as_device_range(c_vals)); } else { GKO_KERNEL_NOT_FOUND; } @@ -245,21 +246,21 @@ void classical_spmv(syn::value_list, if (grid.x > 0 && grid.y > 0) { kernel::abstract_classical_spmv <<get_stream()>>>( - a->get_size()[0], acc::as_cuda_range(a_vals), + a->get_size()[0], acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } } else if (alpha != nullptr && beta != nullptr) { if (grid.x > 0 && grid.y > 0) { kernel::abstract_classical_spmv <<get_stream()>>>( 
a->get_size()[0], as_device_type(alpha->get_const_values()), - acc::as_cuda_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_cuda_range(b_vals), + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), - acc::as_cuda_range(c_vals)); + acc::as_device_range(c_vals)); } } else { GKO_KERNEL_NOT_FOUND; @@ -301,20 +302,20 @@ void load_balance_spmv(std::shared_ptr exec, exec->get_stream()>>>( nwarps, static_cast(a->get_size()[0]), as_device_type(alpha->get_const_values()), - acc::as_cuda_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } } else { if (csr_grid.x > 0 && csr_grid.y > 0) { kernel::abstract_spmv<<get_stream()>>>( nwarps, static_cast(a->get_size()[0]), - acc::as_cuda_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } } } @@ -329,55 +330,55 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, const ValueType* beta, matrix::Dense* c) { - auto handle = exec->get_cusparse_handle(); + auto handle = exec->get_sparselib_handle(); #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - if (!cusparse::is_supported::value || + if (!sparselib::is_supported::value || b->get_stride() != 1 || c->get_stride() != 1 || b->get_size()[0] == 0 || c->get_size()[0] == 0) { return false; } - auto descr = cusparse::create_mat_descr(); + auto descr = sparselib::create_mat_descr(); auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - 
cusparse::spmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0], - a->get_size()[1], a->get_num_stored_elements(), alpha, descr, - a->get_const_values(), row_ptrs, col_idxs, - b->get_const_values(), beta, c->get_values()); + sparselib::spmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0], + a->get_size()[1], a->get_num_stored_elements(), alpha, + descr, a->get_const_values(), row_ptrs, col_idxs, + b->get_const_values(), beta, c->get_values()); - cusparse::destroy(descr); + sparselib::destroy(descr); #else // CUDA_VERSION >= 11000 // workaround for a division by zero in cuSPARSE 11.? if (a->get_size()[1] == 0) { return false; } - cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseOperation_t trans = SPARSELIB_OPERATION_NON_TRANSPOSE; auto row_ptrs = const_cast(a->get_const_row_ptrs()); auto col_idxs = const_cast(a->get_const_col_idxs()); auto values = const_cast(a->get_const_values()); - auto mat = cusparse::create_csr(a->get_size()[0], a->get_size()[1], - a->get_num_stored_elements(), row_ptrs, - col_idxs, values); + auto mat = sparselib::create_csr(a->get_size()[0], a->get_size()[1], + a->get_num_stored_elements(), row_ptrs, + col_idxs, values); auto b_val = const_cast(b->get_const_values()); auto c_val = c->get_values(); if (b->get_stride() == 1 && c->get_stride() == 1) { - auto vecb = cusparse::create_dnvec(b->get_size()[0], b_val); - auto vecc = cusparse::create_dnvec(c->get_size()[0], c_val); + auto vecb = sparselib::create_dnvec(b->get_size()[0], b_val); + auto vecc = sparselib::create_dnvec(c->get_size()[0], c_val); #if CUDA_VERSION >= 11021 constexpr auto alg = CUSPARSE_SPMV_CSR_ALG1; #else constexpr auto alg = CUSPARSE_CSRMV_ALG1; #endif size_type buffer_size = 0; - cusparse::spmv_buffersize(handle, trans, alpha, mat, vecb, - beta, vecc, alg, &buffer_size); + sparselib::spmv_buffersize(handle, trans, alpha, mat, vecb, + beta, vecc, alg, &buffer_size); array buffer_array(exec, buffer_size); auto buffer = 
buffer_array.get_data(); - cusparse::spmv(handle, trans, alpha, mat, vecb, beta, vecc, - alg, buffer); - cusparse::destroy(vecb); - cusparse::destroy(vecc); + sparselib::spmv(handle, trans, alpha, mat, vecb, beta, vecc, + alg, buffer); + sparselib::destroy(vecb); + sparselib::destroy(vecc); } else { #if CUDA_VERSION >= 11060 if (b->get_size()[1] == 1) { @@ -388,22 +389,22 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, #endif // CUDA_VERSION >= 11060 cusparseSpMMAlg_t alg = CUSPARSE_SPMM_CSR_ALG2; auto vecb = - cusparse::create_dnmat(b->get_size(), b->get_stride(), b_val); + sparselib::create_dnmat(b->get_size(), b->get_stride(), b_val); auto vecc = - cusparse::create_dnmat(c->get_size(), c->get_stride(), c_val); + sparselib::create_dnmat(c->get_size(), c->get_stride(), c_val); size_type buffer_size = 0; - cusparse::spmm_buffersize(handle, trans, trans, alpha, mat, - vecb, beta, vecc, alg, - &buffer_size); + sparselib::spmm_buffersize(handle, trans, trans, alpha, mat, + vecb, beta, vecc, alg, + &buffer_size); array buffer_array(exec, buffer_size); auto buffer = buffer_array.get_data(); - cusparse::spmm(handle, trans, trans, alpha, mat, vecb, beta, - vecc, alg, buffer); - cusparse::destroy(vecb); - cusparse::destroy(vecc); + sparselib::spmm(handle, trans, trans, alpha, mat, vecb, beta, + vecc, alg, buffer); + sparselib::destroy(vecb); + sparselib::destroy(vecc); } - cusparse::destroy(mat); + sparselib::destroy(mat); #endif return true; } @@ -437,8 +438,8 @@ bool try_sparselib_spmv(std::shared_ptr exec, return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b, beta->get_const_values(), c); } else { - auto handle = exec->get_cusparse_handle(); - cusparse::pointer_mode_guard pm_guard(handle); + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); const auto valpha = one(); const auto vbeta = zero(); return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c); @@ -583,8 +584,8 @@ void 
spgemm(std::shared_ptr exec, auto b_col_idxs = b->get_const_col_idxs(); auto c_row_ptrs = c->get_row_ptrs(); - auto handle = exec->get_cusparse_handle(); - cusparse::pointer_mode_guard pm_guard(handle); + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto a_nnz = static_cast(a->get_num_stored_elements()); @@ -600,18 +601,18 @@ void spgemm(std::shared_ptr exec, auto& c_vals_array = c_builder.get_value_array(); #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - if (!cusparse::is_supported::value) { + if (!sparselib::is_supported::value) { GKO_NOT_IMPLEMENTED; } - auto a_descr = cusparse::create_mat_descr(); - auto b_descr = cusparse::create_mat_descr(); - auto c_descr = cusparse::create_mat_descr(); - auto d_descr = cusparse::create_mat_descr(); - auto info = cusparse::create_spgemm_info(); + auto a_descr = sparselib::create_mat_descr(); + auto b_descr = sparselib::create_mat_descr(); + auto c_descr = sparselib::create_mat_descr(); + auto d_descr = sparselib::create_mat_descr(); + auto info = sparselib::create_spgemm_info(); // allocate buffer size_type buffer_size{}; - cusparse::spgemm_buffer_size( + sparselib::spgemm_buffer_size( handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz, null_index, null_index, info, buffer_size); @@ -620,74 +621,75 @@ void spgemm(std::shared_ptr exec, // count nnz IndexType c_nnz{}; - cusparse::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs, - a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, - d_descr, zero_nnz, null_index, null_index, c_descr, - c_row_ptrs, &c_nnz, info, buffer); + sparselib::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs, + a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, + d_descr, zero_nnz, null_index, null_index, c_descr, + c_row_ptrs, &c_nnz, info, buffer); // accumulate non-zeros c_col_idxs_array.resize_and_reset(c_nnz); 
c_vals_array.resize_and_reset(c_nnz); auto c_col_idxs = c_col_idxs_array.get_data(); auto c_vals = c_vals_array.get_data(); - cusparse::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals, - a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, b_row_ptrs, - b_col_idxs, null_value, d_descr, zero_nnz, null_value, - null_index, null_index, c_descr, c_vals, c_row_ptrs, - c_col_idxs, info, buffer); - - cusparse::destroy(info); - cusparse::destroy(d_descr); - cusparse::destroy(c_descr); - cusparse::destroy(b_descr); - cusparse::destroy(a_descr); + sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals, + a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, + b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz, + null_value, null_index, null_index, c_descr, c_vals, + c_row_ptrs, c_col_idxs, info, buffer); + + sparselib::destroy(info); + sparselib::destroy(d_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); #else // CUDA_VERSION >= 11000 const auto beta = zero(); - auto spgemm_descr = cusparse::create_spgemm_descr(); - auto a_descr = cusparse::create_csr( + auto spgemm_descr = sparselib::create_spgemm_descr(); + auto a_descr = sparselib::create_csr( m, k, a_nnz, const_cast(a_row_ptrs), const_cast(a_col_idxs), const_cast(a_vals)); - auto b_descr = cusparse::create_csr( + auto b_descr = sparselib::create_csr( k, n, b_nnz, const_cast(b_row_ptrs), const_cast(b_col_idxs), const_cast(b_vals)); - auto c_descr = cusparse::create_csr(m, n, zero_nnz, null_index, null_index, - null_value); + auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index, + null_value); // estimate work size_type buffer1_size{}; - cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, - c_descr, spgemm_descr, buffer1_size, - nullptr); + sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, buffer1_size, + nullptr); array buffer1{exec, buffer1_size}; - 
cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, - c_descr, spgemm_descr, buffer1_size, - buffer1.get_data()); + sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, buffer1_size, + buffer1.get_data()); // compute spgemm size_type buffer2_size{}; - cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, - spgemm_descr, buffer1.get_data(), buffer2_size, - nullptr); + sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, + spgemm_descr, buffer1.get_data(), buffer2_size, + nullptr); array buffer2{exec, buffer2_size}; - cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, - spgemm_descr, buffer1.get_data(), buffer2_size, - buffer2.get_data()); + sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, + spgemm_descr, buffer1.get_data(), buffer2_size, + buffer2.get_data()); // copy data to result - auto c_nnz = cusparse::sparse_matrix_nnz(c_descr); + auto c_nnz = sparselib::sparse_matrix_nnz(c_descr); c_col_idxs_array.resize_and_reset(c_nnz); c_vals_array.resize_and_reset(c_nnz); - cusparse::csr_set_pointers(c_descr, c_row_ptrs, c_col_idxs_array.get_data(), - c_vals_array.get_data()); + sparselib::csr_set_pointers(c_descr, c_row_ptrs, + c_col_idxs_array.get_data(), + c_vals_array.get_data()); - cusparse::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr, - spgemm_descr); + sparselib::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr, + spgemm_descr); - cusparse::destroy(c_descr); - cusparse::destroy(b_descr); - cusparse::destroy(a_descr); - cusparse::destroy(spgemm_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); + sparselib::destroy(spgemm_descr); #endif // CUDA_VERSION >= 11000 } @@ -701,8 +703,8 @@ void advanced_spgemm(std::shared_ptr exec, const matrix::Csr* d, matrix::Csr* c) { - auto handle = exec->get_cusparse_handle(); - 
cusparse::pointer_mode_guard pm_guard(handle); + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); auto valpha = exec->copy_val_to_host(alpha->get_const_values()); auto a_nnz = IndexType(a->get_num_stored_elements()); @@ -724,102 +726,102 @@ void advanced_spgemm(std::shared_ptr exec, auto c_row_ptrs = c->get_row_ptrs(); #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - if (!cusparse::is_supported::value) { + if (!sparselib::is_supported::value) { GKO_NOT_IMPLEMENTED; } matrix::CsrBuilder c_builder{c}; auto& c_col_idxs_array = c_builder.get_col_idx_array(); auto& c_vals_array = c_builder.get_value_array(); - auto a_descr = cusparse::create_mat_descr(); - auto b_descr = cusparse::create_mat_descr(); - auto c_descr = cusparse::create_mat_descr(); - auto d_descr = cusparse::create_mat_descr(); - auto info = cusparse::create_spgemm_info(); + auto a_descr = sparselib::create_mat_descr(); + auto b_descr = sparselib::create_mat_descr(); + auto c_descr = sparselib::create_mat_descr(); + auto d_descr = sparselib::create_mat_descr(); + auto info = sparselib::create_spgemm_info(); // allocate buffer size_type buffer_size{}; - cusparse::spgemm_buffer_size(handle, m, n, k, &valpha, a_descr, a_nnz, - a_row_ptrs, a_col_idxs, b_descr, b_nnz, - b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, - d_row_ptrs, d_col_idxs, info, buffer_size); + sparselib::spgemm_buffer_size( + handle, m, n, k, &valpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs, + b_descr, b_nnz, b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, + d_row_ptrs, d_col_idxs, info, buffer_size); array buffer_array(exec, buffer_size); auto buffer = buffer_array.get_data(); // count nnz IndexType c_nnz{}; - cusparse::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs, - a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, - d_descr, d_nnz, d_row_ptrs, d_col_idxs, c_descr, - c_row_ptrs, &c_nnz, info, buffer); + sparselib::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs, + 
a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, + d_descr, d_nnz, d_row_ptrs, d_col_idxs, c_descr, + c_row_ptrs, &c_nnz, info, buffer); // accumulate non-zeros c_col_idxs_array.resize_and_reset(c_nnz); c_vals_array.resize_and_reset(c_nnz); auto c_col_idxs = c_col_idxs_array.get_data(); auto c_vals = c_vals_array.get_data(); - cusparse::spgemm(handle, m, n, k, &valpha, a_descr, a_nnz, a_vals, - a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, b_row_ptrs, - b_col_idxs, &vbeta, d_descr, d_nnz, d_vals, d_row_ptrs, - d_col_idxs, c_descr, c_vals, c_row_ptrs, c_col_idxs, info, - buffer); - - cusparse::destroy(info); - cusparse::destroy(d_descr); - cusparse::destroy(c_descr); - cusparse::destroy(b_descr); - cusparse::destroy(a_descr); + sparselib::spgemm(handle, m, n, k, &valpha, a_descr, a_nnz, a_vals, + a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, + b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, d_vals, + d_row_ptrs, d_col_idxs, c_descr, c_vals, c_row_ptrs, + c_col_idxs, info, buffer); + + sparselib::destroy(info); + sparselib::destroy(d_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); #else // CUDA_VERSION >= 11000 auto null_value = static_cast(nullptr); auto null_index = static_cast(nullptr); auto one_val = one(); auto zero_val = zero(); auto zero_nnz = IndexType{}; - auto spgemm_descr = cusparse::create_spgemm_descr(); - auto a_descr = cusparse::create_csr( + auto spgemm_descr = sparselib::create_spgemm_descr(); + auto a_descr = sparselib::create_csr( m, k, a_nnz, const_cast(a_row_ptrs), const_cast(a_col_idxs), const_cast(a_vals)); - auto b_descr = cusparse::create_csr( + auto b_descr = sparselib::create_csr( k, n, b_nnz, const_cast(b_row_ptrs), const_cast(b_col_idxs), const_cast(b_vals)); - auto c_descr = cusparse::create_csr(m, n, zero_nnz, null_index, null_index, - null_value); + auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index, + null_value); // estimate work size_type 
buffer1_size{}; - cusparse::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, - &zero_val, c_descr, spgemm_descr, - buffer1_size, nullptr); + sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, + &zero_val, c_descr, spgemm_descr, + buffer1_size, nullptr); array buffer1{exec, buffer1_size}; - cusparse::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, - &zero_val, c_descr, spgemm_descr, - buffer1_size, buffer1.get_data()); + sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, + &zero_val, c_descr, spgemm_descr, + buffer1_size, buffer1.get_data()); // compute spgemm size_type buffer2_size{}; - cusparse::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, - c_descr, spgemm_descr, buffer1.get_data(), - buffer2_size, nullptr); + sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, nullptr); array buffer2{exec, buffer2_size}; - cusparse::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, - c_descr, spgemm_descr, buffer1.get_data(), - buffer2_size, buffer2.get_data()); + sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, buffer2.get_data()); // write result to temporary storage - auto c_tmp_nnz = cusparse::sparse_matrix_nnz(c_descr); + auto c_tmp_nnz = sparselib::sparse_matrix_nnz(c_descr); array c_tmp_row_ptrs_array(exec, m + 1); array c_tmp_col_idxs_array(exec, c_tmp_nnz); array c_tmp_vals_array(exec, c_tmp_nnz); - cusparse::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(), - c_tmp_col_idxs_array.get_data(), - c_tmp_vals_array.get_data()); + sparselib::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(), + c_tmp_col_idxs_array.get_data(), + c_tmp_vals_array.get_data()); - cusparse::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val, - c_descr, spgemm_descr); + sparselib::spgemm_copy(handle, &one_val, a_descr, 
b_descr, &zero_val, + c_descr, spgemm_descr); - cusparse::destroy(c_descr); - cusparse::destroy(b_descr); - cusparse::destroy(a_descr); - cusparse::destroy(spgemm_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); + sparselib::destroy(spgemm_descr); auto spgeam_total_nnz = c_tmp_nnz + d->get_num_stored_elements(); auto nnz_per_row = spgeam_total_nnz / m; @@ -846,13 +848,13 @@ void transpose(std::shared_ptr exec, if (orig->get_size()[0] == 0) { return; } - if (cusparse::is_supported::value) { + if (sparselib::is_supported::value) { #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - cusparse::transpose( - exec->get_cusparse_handle(), orig->get_size()[0], + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -864,8 +866,8 @@ void transpose(std::shared_ptr exec, cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1; size_type buffer_size = 0; - cusparse::transpose_buffersize( - exec->get_cusparse_handle(), orig->get_size()[0], + sparselib::transpose_buffersize( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -873,8 +875,8 @@ void transpose(std::shared_ptr exec, idxBase, alg, &buffer_size); array buffer_array(exec, buffer_size); auto buffer = buffer_array.get_data(); - cusparse::transpose( - exec->get_cusparse_handle(), orig->get_size()[0], + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), 
orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -898,13 +900,13 @@ void conj_transpose(std::shared_ptr exec, const auto block_size = default_block_size; const auto grid_size = ceildiv(trans->get_num_stored_elements(), block_size); - if (cusparse::is_supported::value) { + if (sparselib::is_supported::value) { #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - cusparse::transpose( - exec->get_cusparse_handle(), orig->get_size()[0], + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -916,8 +918,8 @@ void conj_transpose(std::shared_ptr exec, cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1; size_type buffer_size = 0; - cusparse::transpose_buffersize( - exec->get_cusparse_handle(), orig->get_size()[0], + sparselib::transpose_buffersize( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -925,8 +927,8 @@ void conj_transpose(std::shared_ptr exec, idxBase, alg, &buffer_size); array buffer_array(exec, buffer_size); auto buffer = buffer_array.get_data(); - cusparse::transpose( - exec->get_cusparse_handle(), orig->get_size()[0], + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -948,9 +950,9 @@ template void sort_by_column_index(std::shared_ptr exec, matrix::Csr* to_sort) { - if (cusparse::is_supported::value) { - auto handle = 
exec->get_cusparse_handle(); - auto descr = cusparse::create_mat_descr(); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + auto descr = sparselib::create_mat_descr(); auto m = IndexType(to_sort->get_size()[0]); auto n = IndexType(to_sort->get_size()[1]); auto nnz = IndexType(to_sort->get_num_stored_elements()); @@ -966,30 +968,30 @@ void sort_by_column_index(std::shared_ptr exec, // init identity permutation array permutation_array(exec, nnz); auto permutation = permutation_array.get_data(); - cusparse::create_identity_permutation(handle, nnz, permutation); + sparselib::create_identity_permutation(handle, nnz, permutation); // allocate buffer size_type buffer_size{}; - cusparse::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs, - buffer_size); + sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs, + buffer_size); array buffer_array{exec, buffer_size}; auto buffer = buffer_array.get_data(); // sort column indices - cusparse::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs, - permutation, buffer); + sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs, + permutation, buffer); // sort values #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cusparse::gather(handle, nnz, tmp_vals, vals, permutation); + sparselib::gather(handle, nnz, tmp_vals, vals, permutation); #else // CUDA_VERSION >= 11000 - auto val_vec = cusparse::create_spvec(nnz, nnz, permutation, vals); + auto val_vec = sparselib::create_spvec(nnz, nnz, permutation, vals); auto tmp_vec = - cusparse::create_dnvec(nnz, const_cast(tmp_vals)); - cusparse::gather(handle, tmp_vec, val_vec); + sparselib::create_dnvec(nnz, const_cast(tmp_vals)); + sparselib::gather(handle, tmp_vec, val_vec); #endif - cusparse::destroy(descr); + sparselib::destroy(descr); } else { fallback_sort(exec, to_sort); } diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 04b34953c6a..b117c39107b 100644 --- 
a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -17,12 +17,13 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/pointer_mode_guard.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" @@ -53,11 +54,11 @@ void compute_dot_dispatch(std::shared_ptr exec, matrix::Dense* result, array& tmp) { if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - cublas::dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), y->get_stride(), - result->get_values()); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::dot(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), y->get_stride(), + result->get_values()); } else { compute_dot(exec, x, y, result, tmp); } @@ -78,11 +79,11 @@ void compute_conj_dot_dispatch(std::shared_ptr exec, array& tmp) { if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - cublas::conj_dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::conj_dot(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), + y->get_stride(), result->get_values()); } else { 
compute_conj_dot(exec, x, y, result, tmp); } @@ -102,10 +103,10 @@ void compute_norm2_dispatch(std::shared_ptr exec, array& tmp) { if (x->get_size()[1] == 1) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - cublas::norm2(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::norm2(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), result->get_values()); } else { compute_norm2(exec, x, result, tmp); } @@ -124,18 +125,18 @@ void simple_apply(std::shared_ptr exec, const matrix::Dense* b, matrix::Dense* c) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { if (a->get_size()[1] > 0) { - cublas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - cublas::gemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, c->get_size()[1], - c->get_size()[0], a->get_size()[1], &alpha, - b->get_const_values(), b->get_stride(), - a->get_const_values(), a->get_stride(), &beta, - c->get_values(), c->get_stride()); + blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1], + c->get_size()[0], a->get_size()[1], &alpha, + b->get_const_values(), b->get_stride(), + a->get_const_values(), a->get_stride(), &beta, + c->get_values(), c->get_stride()); } else { dense::fill(exec, c, zero()); } @@ -154,15 +155,15 @@ void apply(std::shared_ptr exec, const matrix::Dense* a, const matrix::Dense* b, const matrix::Dense* beta, matrix::Dense* c) { - if (cublas::is_supported::value) { + if (blas::is_supported::value) { if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { if (a->get_size()[1] > 0) { - cublas::gemm( - exec->get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, - c->get_size()[1], c->get_size()[0], a->get_size()[1], - 
alpha->get_const_values(), b->get_const_values(), - b->get_stride(), a->get_const_values(), a->get_stride(), - beta->get_const_values(), c->get_values(), c->get_stride()); + blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N, + c->get_size()[1], c->get_size()[0], a->get_size()[1], + alpha->get_const_values(), b->get_const_values(), + b->get_stride(), a->get_const_values(), + a->get_stride(), beta->get_const_values(), + c->get_values(), c->get_stride()); } else { dense::scale(exec, beta, c); } @@ -180,17 +181,17 @@ void transpose(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* trans) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - cublas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, trans->get_values(), - trans->get_stride(), trans->get_values(), - trans->get_stride()); + blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, trans->get_const_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; @@ -205,17 +206,17 @@ void conj_transpose(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* trans) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - cublas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - cublas::geam(handle, CUBLAS_OP_C, CUBLAS_OP_N, 
orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, trans->get_values(), - trans->get_stride(), trans->get_values(), - trans->get_stride()); + blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, trans->get_const_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; diff --git a/cuda/matrix/diagonal_kernels.cu b/cuda/matrix/diagonal_kernels.cu index b81329e29a0..e362ff0462b 100644 --- a/cuda/matrix/diagonal_kernels.cu +++ b/cuda/matrix/diagonal_kernels.cu @@ -9,9 +9,10 @@ #include -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 9c23abc9dc4..105122ec4a9 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -15,19 +15,20 @@ #include -#include "accessor/cuda_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" #include "accessor/reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" 
-#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/format_conversion.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" @@ -97,9 +98,9 @@ void abstract_spmv(syn::value_list, using arithmetic_type = highest_precision; using a_accessor = - gko::acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; + acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; using b_accessor = - gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; + acc::reduced_row_major<2, arithmetic_type, const InputValueType>; const auto nrows = a->get_size()[0]; const auto stride = a->get_stride(); @@ -114,11 +115,11 @@ void abstract_spmv(syn::value_list, const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x), b->get_size()[1], 1); - const auto a_vals = gko::acc::range( + const auto a_vals = acc::range( std::array{{static_cast( num_stored_elements_per_row * stride)}}, a->get_const_values()); - const auto b_vals = gko::acc::range( + const auto b_vals = acc::range( std::array{ {static_cast(b->get_size()[0]), static_cast(b->get_size()[1])}}, @@ -130,20 +131,21 @@ void abstract_spmv(syn::value_list, if (grid_size.x > 0 && grid_size.y > 0) { kernel::spmv <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_cuda_range(a_vals), + nrows, num_worker_per_row, acc::as_device_range(a_vals), a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_cuda_range(b_vals), + num_stored_elements_per_row, acc::as_device_range(b_vals), as_device_type(c->get_values()), c->get_stride()); } } else if (alpha != nullptr && beta != nullptr) { - const auto alpha_val = gko::acc::range( + const auto alpha_val = acc::range( std::array{1}, alpha->get_const_values()); if (grid_size.x > 0 && grid_size.y > 0) { kernel::spmv <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_cuda_range(alpha_val), - 
acc::as_cuda_range(a_vals), a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_cuda_range(b_vals), + nrows, num_worker_per_row, acc::as_device_range(alpha_val), + acc::as_device_range(a_vals), a->get_const_col_idxs(), + stride, num_stored_elements_per_row, + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), as_device_type(c->get_values()), c->get_stride()); } @@ -212,7 +214,7 @@ void spmv(std::shared_ptr exec, const int num_worker_per_row = std::get<2>(data); /** - * info is the parameter for selecting the cuda kernel. + * info is the parameter for selecting the device kernel. * for info == 0, it uses the kernel by warp_size threads with atomic * operation for other value, it uses the kernel without atomic_add */ @@ -246,7 +248,7 @@ void advanced_spmv(std::shared_ptr exec, const int num_worker_per_row = std::get<2>(data); /** - * info is the parameter for selecting the cuda kernel. + * info is the parameter for selecting the device kernel. * for info == 0, it uses the kernel by warp_size threads with atomic * operation for other value, it uses the kernel without atomic_add */ diff --git a/cuda/matrix/fbcsr_kernels.template.cu b/cuda/matrix/fbcsr_kernels.template.cu index 8b835c6fd7d..ad36c84216e 100644 --- a/cuda/matrix/fbcsr_kernels.template.cu +++ b/cuda/matrix/fbcsr_kernels.template.cu @@ -24,6 +24,13 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/unified/base/kernel_launch.hpp" #include "core/base/array_access.hpp" #include "core/base/block_sizes.hpp" @@ -33,16 +40,10 @@ #include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include 
"core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/cusparse_block_bindings.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" @@ -72,6 +73,7 @@ constexpr int default_block_size{512}; namespace { + template void dense_transpose(std::shared_ptr exec, const size_type nrows, const size_type ncols, @@ -81,21 +83,22 @@ void dense_transpose(std::shared_ptr exec, if (nrows == 0) { return; } - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); { - cublas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, nrows, ncols, &alpha, - orig, orig_stride, &beta, trans, trans_stride, trans, - trans_stride); + blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig, + orig_stride, &beta, trans, trans_stride, trans, + trans_stride); } } else { GKO_NOT_IMPLEMENTED; } } + } // namespace @@ -114,12 +117,12 @@ void spmv(std::shared_ptr exec, dense::fill(exec, c, zero()); return; } - if (cusparse::is_supported::value) { - auto handle = exec->get_cusparse_handle(); - cusparse::pointer_mode_guard pm_guard(handle); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); const auto alpha = one(); const auto beta = zero(); - auto descr = cusparse::create_mat_descr(); + auto descr = sparselib::create_mat_descr(); const auto row_ptrs = 
a->get_const_row_ptrs(); const auto col_idxs = a->get_const_col_idxs(); const auto values = a->get_const_values(); @@ -133,21 +136,21 @@ void spmv(std::shared_ptr exec, const auto in_stride = b->get_stride(); const auto out_stride = c->get_stride(); if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, - nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), &beta, c->get_values()); + sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, &alpha, descr, values, row_ptrs, col_idxs, + bs, b->get_const_values(), &beta, c->get_values()); } else { const auto trans_stride = nrows; auto trans_c = array(exec, nrows * nrhs); - cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, - &alpha, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), in_stride, &beta, - trans_c.get_data(), trans_stride); + sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + &alpha, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, &beta, + trans_c.get_data(), trans_stride); dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), out_stride, c->get_values()); } - cusparse::destroy(descr); + sparselib::destroy(descr); } else { GKO_NOT_IMPLEMENTED; } @@ -171,11 +174,11 @@ void advanced_spmv(std::shared_ptr exec, dense::scale(exec, beta, c); return; } - if (cusparse::is_supported::value) { - auto handle = exec->get_cusparse_handle(); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); const auto alphp = alpha->get_const_values(); const auto betap = beta->get_const_values(); - auto descr = cusparse::create_mat_descr(); + auto descr = sparselib::create_mat_descr(); const auto row_ptrs = a->get_const_row_ptrs(); const auto col_idxs = a->get_const_col_idxs(); const auto values = 
a->get_const_values(); @@ -189,23 +192,23 @@ void advanced_spmv(std::shared_ptr exec, const auto in_stride = b->get_stride(); const auto out_stride = c->get_stride(); if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, - nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), betap, c->get_values()); + sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), betap, c->get_values()); } else { const auto trans_stride = nrows; auto trans_c = array(exec, nrows * nrhs); dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), trans_stride, trans_c.get_data()); - cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, - alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), in_stride, betap, - trans_c.get_data(), trans_stride); + sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, betap, + trans_c.get_data(), trans_stride); dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), out_stride, c->get_values()); } - cusparse::destroy(descr); + sparselib::destroy(descr); } else { GKO_NOT_IMPLEMENTED; } @@ -244,20 +247,21 @@ void transpose(const std::shared_ptr exec, const matrix::Fbcsr* const orig, matrix::Fbcsr* const trans) { - if (cusparse::is_supported::value) { +#ifdef GKO_COMPILING_CUDA + if (sparselib::is_supported::value) { const int bs = orig->get_block_size(); const IndexType nnzb = static_cast(orig->get_num_stored_blocks()); cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - const IndexType buffer_size = cusparse::bsr_transpose_buffersize( - exec->get_cusparse_handle(), 
orig->get_num_block_rows(), + const IndexType buffer_size = sparselib::bsr_transpose_buffersize( + exec->get_sparselib_handle(), orig->get_num_block_rows(), orig->get_num_block_cols(), nnzb, orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs); array buffer_array(exec, buffer_size); auto buffer = buffer_array.get_data(); - cusparse::bsr_transpose( - exec->get_cusparse_handle(), orig->get_num_block_rows(), + sparselib::bsr_transpose( + exec->get_sparselib_handle(), orig->get_num_block_rows(), orig->get_num_block_cols(), nnzb, orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs, trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(), @@ -268,7 +272,9 @@ void transpose(const std::shared_ptr exec, fixedblock::compiled_kernels(), [bs](int compiled_block_size) { return bs == compiled_block_size; }, syn::value_list(), syn::type_list<>(), exec, trans); - } else { + } else +#endif + { fallback_transpose(exec, orig, trans); } } diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu index 5eadf0d3858..d6c20075ef4 100644 --- a/cuda/matrix/sellp_kernels.cu +++ b/cuda/matrix/sellp_kernels.cu @@ -12,10 +12,11 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/components/prefix_sum_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu index 3a3d60b24e0..311e4d3782c 100644 --- a/cuda/matrix/sparsity_csr_kernels.cu +++ b/cuda/matrix/sparsity_csr_kernels.cu @@ -11,18 +11,19 @@ #include -#include "accessor/cuda_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" #include 
"accessor/reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -41,7 +42,11 @@ namespace sparsity_csr { constexpr int classical_oversubscription = 32; constexpr int default_block_size = 512; +#ifdef GKO_COMPILING_HIP +constexpr int spmv_block_size = 256; +#else constexpr int spmv_block_size = 128; +#endif constexpr int warps_in_block = 4; @@ -105,16 +110,16 @@ void classical_spmv(syn::value_list, a->get_size()[0], as_device_type(a->get_const_value()), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } else if (alpha != nullptr && beta != nullptr) { kernel::abstract_classical_spmv <<get_stream()>>>( a->get_size()[0], as_device_type(alpha->get_const_values()), as_device_type(a->get_const_value()), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_cuda_range(b_vals), + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), - acc::as_cuda_range(c_vals)); + acc::as_device_range(c_vals)); } else { GKO_KERNEL_NOT_FOUND; } @@ -168,21 +173,21 @@ void sort_by_column_index(std::shared_ptr exec, const auto num_cols = 
static_cast(to_sort->get_size()[1]); const auto row_ptrs = to_sort->get_const_row_ptrs(); const auto col_idxs = to_sort->get_col_idxs(); - if (cusparse::is_supported::value) { - const auto handle = exec->get_cusparse_handle(); - auto descr = cusparse::create_mat_descr(); + if (sparselib::is_supported::value) { + const auto handle = exec->get_sparselib_handle(); + auto descr = sparselib::create_mat_descr(); array permutation_array(exec, to_sort->get_num_nonzeros()); auto permutation = permutation_array.get_data(); components::fill_seq_array(exec, permutation, to_sort->get_num_nonzeros()); size_type buffer_size{}; - cusparse::csrsort_buffer_size(handle, num_rows, num_cols, nnz, row_ptrs, - col_idxs, buffer_size); + sparselib::csrsort_buffer_size(handle, num_rows, num_cols, nnz, + row_ptrs, col_idxs, buffer_size); array buffer_array{exec, buffer_size}; auto buffer = buffer_array.get_data(); - cusparse::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs, - col_idxs, permutation, buffer); - cusparse::destroy(descr); + sparselib::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs, + col_idxs, permutation, buffer); + sparselib::destroy(descr); } else { fallback_sort(exec, to_sort); } diff --git a/cuda/multigrid/pgm_kernels.cu b/cuda/multigrid/pgm_kernels.cu index b5e9fa1612d..75c3dd911ad 100644 --- a/cuda/multigrid/pgm_kernels.cu +++ b/cuda/multigrid/pgm_kernels.cu @@ -8,8 +8,6 @@ #include -#include -#include #include #include #include @@ -21,8 +19,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" namespace gko { diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh index 0eae8650bdc..e83d6e04ee9 100644 --- a/cuda/preconditioner/batch_preconditioners.cuh +++ b/cuda/preconditioner/batch_preconditioners.cuh @@ -6,9 +6,9 @@ #define GKO_CUDA_PRECONDITIONER_BATCH_PRECONDITIONERS_CUH_ +#include 
"common/cuda_hip/components/cooperative_groups.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu index 6551f32bb86..d0dd516466a 100644 --- a/cuda/preconditioner/isai_kernels.cu +++ b/cuda/preconditioner/isai_kernels.cu @@ -10,12 +10,13 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu similarity index 100% rename from cuda/preconditioner/jacobi_advanced_apply_kernel.cu rename to cuda/preconditioner/jacobi_advanced_apply_kernels.cu diff --git a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu similarity index 94% rename from cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu rename to cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu index 5633ad15a4b..ed33437c613 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu @@ -8,14 +8,14 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" 
#include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/warp_blas.cuh" #include "cuda/preconditioner/jacobi_common.hpp" @@ -32,7 +32,7 @@ namespace cuda { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc" +#include // clang-format off diff --git a/cuda/preconditioner/jacobi_common.hpp.in b/cuda/preconditioner/jacobi_common.hpp.in index fe99fd88227..aeb47fec97e 100644 --- a/cuda/preconditioner/jacobi_common.hpp.in +++ b/cuda/preconditioner/jacobi_common.hpp.in @@ -6,7 +6,7 @@ #include -#include "cuda/base/config.hpp" +#include "common/cuda_hip/base/config.hpp" namespace gko { namespace kernels { diff --git a/cuda/preconditioner/jacobi_generate_kernel.cu b/cuda/preconditioner/jacobi_generate_kernels.cu similarity index 100% rename from cuda/preconditioner/jacobi_generate_kernel.cu rename to cuda/preconditioner/jacobi_generate_kernels.cu diff --git a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu similarity index 94% rename from cuda/preconditioner/jacobi_generate_instantiate.inc.cu rename to cuda/preconditioner/jacobi_generate_kernels.instantiate.cu index a76c4fba271..56e8ff6f16f 100644 --- a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu @@ -9,14 +9,14 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include 
"core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/diagonal_block_manipulation.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -35,7 +35,7 @@ namespace cuda { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc" +#include // clang-format off diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index 2508f33acb9..bce2ff23303 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -8,13 +8,14 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/preconditioner/jacobi_common.hpp" @@ -30,8 +31,12 @@ namespace cuda { namespace jacobi { -// a total of 32 warps (1024 threads) +// a total of 32/16 warps (1024 threads) +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC +constexpr int default_num_warps = 16; +#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC constexpr int default_num_warps = 32; +#endif // with current architectures, at most 32 warps can be scheduled per SM (and // current GPUs have at most 84 SMs) constexpr int default_grid_size = 32 * 32 * 128; diff --git a/cuda/preconditioner/jacobi_simple_apply_kernel.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.cu similarity index 100% rename from 
cuda/preconditioner/jacobi_simple_apply_kernel.cu rename to cuda/preconditioner/jacobi_simple_apply_kernels.cu diff --git a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu similarity index 93% rename from cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu rename to cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu index 07689daa815..97a7bfff489 100644 --- a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu @@ -8,14 +8,14 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/warp_blas.cuh" #include "cuda/preconditioner/jacobi_common.hpp" @@ -32,7 +32,7 @@ namespace cuda { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc" +#include // clang-format off diff --git a/cuda/reorder/rcm_kernels.cu b/cuda/reorder/rcm_kernels.cu index d699d00dfb6..72322016fba 100644 --- a/cuda/reorder/rcm_kernels.cu +++ b/cuda/reorder/rcm_kernels.cu @@ -25,9 +25,9 @@ #include +#include "common/cuda_hip/components/memory.hpp" #include "core/base/array_access.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/memory.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 0ce95e2d34f..58e1a6b7b0d 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ 
b/cuda/solver/batch_bicgstab_kernels.cu @@ -13,15 +13,16 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/kernel_config.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index f429e5f22f0..398e831eb09 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -13,15 +13,15 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/kernel_config.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu index 107835ca1b5..3dbefadf22a 100644 --- a/cuda/solver/cb_gmres_kernels.cu +++ b/cuda/solver/cb_gmres_kernels.cu @@ -8,25 +8,25 @@ #include +#include #include -#include #include #include -#include "accessor/cuda_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" #include "accessor/range.hpp" #include 
"accessor/reduced_row_major.hpp" #include "accessor/scaled_reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/solver/cb_gmres_accessor.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -44,6 +44,8 @@ namespace cb_gmres { constexpr int default_block_size = 512; +// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block +// size limit. constexpr int default_dot_dim = 32; constexpr int default_dot_size = default_dot_dim * default_dot_dim; @@ -116,7 +118,7 @@ void restart(std::shared_ptr exec, restart_1_kernel <<get_stream()>>>( residual->get_size()[0], residual->get_size()[1], krylov_dim, - acc::as_cuda_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm, @@ -145,7 +147,7 @@ void restart(std::shared_ptr exec, residual_norm->get_stride(), as_device_type(arnoldi_norm->get_const_values() + 2 * stride_arnoldi), - stride_arnoldi, acc::as_cuda_range(krylov_bases)); + stride_arnoldi, acc::as_device_range(krylov_bases)); } const auto grid_dim_2 = @@ -158,7 +160,7 @@ void restart(std::shared_ptr exec, residual->get_stride(), as_device_type(residual_norm->get_const_values()), as_device_type(residual_norm_collection->get_values()), - acc::as_cuda_range(krylov_bases), + acc::as_device_range(krylov_bases), 
as_device_type(next_krylov_basis->get_values()), next_krylov_basis->get_stride(), as_device_type(final_iter_nums->get_data())); @@ -212,6 +214,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, as_device_type(next_krylov_basis->get_const_values()), stride_next_krylov, as_device_type(arnoldi_norm->get_values()), as_device_type(stop_status)); + // nrmP = norm(next_krylov_basis) zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg, hessenberg_iter->get_values()); if (dim_size[1] > 1) { @@ -219,7 +222,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, <<get_stream()>>>( dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_values()), stride_hessenberg, as_device_type(stop_status)); } else { @@ -228,7 +231,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[0], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_values()), stride_hessenberg, as_device_type(stop_status)); } @@ -240,7 +243,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, default_block_size, 0, exec->get_stream()>>>( iter + 1, dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_cuda_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_const_values()), stride_hessenberg, as_device_type(stop_status)); @@ -269,7 +272,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[1], as_device_type(arnoldi_norm->get_values()), stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases), + stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), 
as_device_type(stop_status), as_device_type(reorth_status), as_device_type(num_reorth->get_data())); num_reorth_host = get_element(*num_reorth, 0); @@ -282,7 +285,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, <<get_stream()>>>( dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(buffer_iter->get_values()), stride_buffer, as_device_type(stop_status)); } else { @@ -291,7 +294,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[0], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(buffer_iter->get_values()), stride_buffer, as_device_type(stop_status)); } @@ -303,7 +306,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, default_block_size, 0, exec->get_stream()>>>( iter + 1, dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_values()), stride_hessenberg, as_device_type(buffer_iter->get_const_values()), stride_buffer, @@ -335,18 +338,19 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[1], as_device_type(arnoldi_norm->get_values()), stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases), + stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), as_device_type(stop_status), as_device_type(reorth_status), num_reorth->get_data()); num_reorth_host = get_element(*num_reorth, 0); + // num_reorth_host := number of next_krylov vector to be + // reorthogonalization } - update_krylov_next_krylov_kernel <<get_stream()>>>( iter, dim_size[0], dim_size[1], 
as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_cuda_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_const_values()), stride_hessenberg, as_device_type(stop_status)); // next_krylov_basis /= hessenberg(iter, iter + 1) @@ -460,7 +464,7 @@ void calculate_qy(std::shared_ptr exec, calculate_Qy_kernel <<get_stream()>>>( - num_rows, num_cols, acc::as_cuda_range(krylov_bases), + num_rows, num_cols, acc::as_device_range(krylov_bases), as_device_type(y->get_const_values()), y->get_stride(), as_device_type(before_preconditioner->get_values()), stride_before_preconditioner, diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index cb627e04b53..549925bf2e7 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -20,15 +20,15 @@ #include +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "core/base/array_access.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/pointer_mode_guard.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/memory.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -75,18 +75,18 @@ struct CudaSolveStruct : gko::solver::SolveStruct { if (num_rhs == 0) { return; } - cusparse::pointer_mode_guard pm_guard(handle); - spsm_descr = cusparse::create_spsm_descr(); - descr_a = cusparse::create_csr( + sparselib::pointer_mode_guard pm_guard(handle); + spsm_descr = sparselib::create_spsm_descr(); + descr_a = sparselib::create_csr( matrix->get_size()[0], matrix->get_size()[1], matrix->get_num_stored_elements(), 
const_cast(matrix->get_const_row_ptrs()), const_cast(matrix->get_const_col_idxs()), const_cast(matrix->get_const_values())); - cusparse::set_attribute( + sparselib::set_attribute( descr_a, CUSPARSE_SPMAT_FILL_MODE, is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER); - cusparse::set_attribute( + sparselib::set_attribute( descr_a, CUSPARSE_SPMAT_DIAG_TYPE, unit_diag ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT); @@ -94,28 +94,28 @@ struct CudaSolveStruct : gko::solver::SolveStruct { // workaround suggested by NVIDIA engineers: for some reason // cusparse needs non-nullptr input vectors even for analysis // also make sure they are aligned by 16 bytes - auto descr_b = cusparse::create_dnmat( + auto descr_b = sparselib::create_dnmat( dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1], reinterpret_cast(0xDEAD0)); - auto descr_c = cusparse::create_dnmat( + auto descr_c = sparselib::create_dnmat( dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1], reinterpret_cast(0xDEAF0)); - auto work_size = cusparse::spsm_buffer_size( + auto work_size = sparselib::spsm_buffer_size( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, one(), descr_a, descr_b, descr_c, CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr); work.resize_and_reset(work_size); - cusparse::spsm_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, - one(), descr_a, descr_b, descr_c, - CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr, - work.get_data()); + sparselib::spsm_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + one(), descr_a, descr_b, descr_c, + CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr, + work.get_data()); - cusparse::destroy(descr_b); - cusparse::destroy(descr_c); + sparselib::destroy(descr_b); + sparselib::destroy(descr_c); } void solve(const matrix::Csr*, @@ -134,30 +134,30 @@ struct CudaSolveStruct : gko::solver::SolveStruct { "provided at generation time. 
Check the value specified in " ".with_num_rhs(...)."}; } - cusparse::pointer_mode_guard pm_guard(handle); - auto descr_b = cusparse::create_dnmat( + sparselib::pointer_mode_guard pm_guard(handle); + auto descr_b = sparselib::create_dnmat( input->get_size(), input->get_stride(), const_cast(input->get_const_values())); - auto descr_c = cusparse::create_dnmat( + auto descr_c = sparselib::create_dnmat( output->get_size(), output->get_stride(), output->get_values()); - cusparse::spsm_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, one(), - descr_a, descr_b, descr_c, - CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr); + sparselib::spsm_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + one(), descr_a, descr_b, descr_c, + CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr); - cusparse::destroy(descr_b); - cusparse::destroy(descr_c); + sparselib::destroy(descr_b); + sparselib::destroy(descr_c); } ~CudaSolveStruct() { if (descr_a) { - cusparse::destroy(descr_a); + sparselib::destroy(descr_a); descr_a = nullptr; } if (spsm_descr) { - cusparse::destroy(spsm_descr); + sparselib::destroy(spsm_descr); spsm_descr = nullptr; } } @@ -200,21 +200,21 @@ struct CudaSolveStruct : gko::solver::SolveStruct { if (num_rhs == 0) { return; } - cusparse::pointer_mode_guard pm_guard(handle); - factor_descr = cusparse::create_mat_descr(); - solve_info = cusparse::create_solve_info(); - cusparse::set_mat_fill_mode( + sparselib::pointer_mode_guard pm_guard(handle); + factor_descr = sparselib::create_mat_descr(); + solve_info = sparselib::create_solve_info(); + sparselib::set_mat_fill_mode( factor_descr, is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER); - cusparse::set_mat_diag_type( + sparselib::set_mat_diag_type( factor_descr, unit_diag ? 
CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT); algorithm = 0; - policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; + policy = SPARSELIB_SOLVE_POLICY_USE_LEVEL; size_type work_size{}; - cusparse::buffer_size_ext( + sparselib::buffer_size_ext( handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, @@ -225,7 +225,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct { // allocate workspace work.resize_and_reset(work_size); - cusparse::csrsm2_analysis( + sparselib::csrsm2_analysis( handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, @@ -250,9 +250,9 @@ struct CudaSolveStruct : gko::solver::SolveStruct { "provided at generation time. Check the value specified in " ".with_num_rhs(...)."}; } - cusparse::pointer_mode_guard pm_guard(handle); + sparselib::pointer_mode_guard pm_guard(handle); dense::copy(exec, input, output); - cusparse::csrsm2_solve( + sparselib::csrsm2_solve( handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], output->get_stride(), matrix->get_num_stored_elements(), @@ -265,11 +265,11 @@ struct CudaSolveStruct : gko::solver::SolveStruct { ~CudaSolveStruct() { if (factor_descr) { - cusparse::destroy(factor_descr); + sparselib::destroy(factor_descr); factor_descr = nullptr; } if (solve_info) { - cusparse::destroy(solve_info); + sparselib::destroy(solve_info); solve_info = nullptr; } } @@ -304,7 +304,7 @@ void generate_kernel(std::shared_ptr exec, if (matrix->get_size()[0] == 0) { return; } - if (cusparse::is_supported::value) { + if (sparselib::is_supported::value) { solve_struct = std::make_shared>( exec, matrix, num_rhs, is_upper, unit_diag); } else { @@ -327,7 +327,7 @@ void solve_kernel(std::shared_ptr exec, } using vec = matrix::Dense; - if 
(cusparse::is_supported::value) { + if (sparselib::is_supported::value) { if (auto cuda_solve_struct = dynamic_cast*>( solve_struct)) { diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu index 9c97d99f13c..f7e89c9d9d8 100644 --- a/cuda/solver/idr_kernels.cu +++ b/cuda/solver/idr_kernels.cu @@ -13,14 +13,15 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/randlib_bindings.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/fill_array_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/curand_bindings.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" @@ -69,14 +70,14 @@ void initialize_subspace_vectors(std::shared_ptr exec, bool deterministic) { if (!deterministic) { - auto gen = curand::rand_generator(std::random_device{}(), - CURAND_RNG_PSEUDO_DEFAULT, - exec->get_stream()); - curand::rand_vector( + auto gen = randlib::rand_generator(std::random_device{}(), + RANDLIB_RNG_PSEUDO_DEFAULT, + exec->get_stream()); + randlib::rand_vector( gen, subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), 0.0, 1.0, subspace_vectors->get_values()); - curand::destroy(gen); + randlib::destroy(gen); } } @@ -145,9 +146,8 @@ void update_g_and_u(std::shared_ptr exec, as_device_type(alpha->get_values()), stop_status->get_const_data()); } else { - cublas::dot(exec->get_cublas_handle(), size, p_i, 1, - g_k->get_values(), g_k->get_stride(), - alpha->get_values()); + blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(), + g_k->get_stride(), alpha->get_values()); } update_g_k_and_u_kernel 
<<get_stride(), default_block_size), @@ -196,8 +196,8 @@ void update_m(std::shared_ptr exec, const size_type nrhs, as_device_type(g_k->get_const_values()), g_k->get_stride(), as_device_type(m_i), stop_status->get_const_data()); } else { - cublas::dot(exec->get_cublas_handle(), size, p_i, 1, - g_k->get_const_values(), g_k->get_stride(), m_i); + blas::dot(exec->get_blas_handle(), size, p_i, 1, + g_k->get_const_values(), g_k->get_stride(), m_i); } } } diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu index 46b4cb4c2e4..002cc0140cb 100644 --- a/cuda/solver/lower_trs_kernels.cu +++ b/cuda/solver/lower_trs_kernels.cu @@ -17,9 +17,9 @@ #include -#include "cuda/base/cusparse_bindings.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/solver/common_trs_kernels.cuh" diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu index 4eea02883b2..1d31130623a 100644 --- a/cuda/solver/multigrid_kernels.cu +++ b/cuda/solver/multigrid_kernels.cu @@ -11,9 +11,10 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/base/array_access.hpp" #include "core/components/fill_array_kernels.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu index a8ee5f77cca..e1e01538f79 100644 --- a/cuda/solver/upper_trs_kernels.cu +++ b/cuda/solver/upper_trs_kernels.cu @@ -17,9 +17,9 @@ #include -#include "cuda/base/cusparse_bindings.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/solver/common_trs_kernels.cuh" diff --git a/cuda/stop/criterion_kernels.cu b/cuda/stop/criterion_kernels.cu index 17bcbbc1567..e54b5d140f2 100644 --- 
a/cuda/stop/criterion_kernels.cu +++ b/cuda/stop/criterion_kernels.cu @@ -10,8 +10,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu index 18102d91ec5..7146d0cbf04 100644 --- a/cuda/stop/residual_norm_kernels.cu +++ b/cuda/stop/residual_norm_kernels.cu @@ -10,9 +10,9 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "core/base/array_access.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu index c7f70fe3011..944e7642223 100644 --- a/cuda/test/base/math.cu +++ b/cuda/test/base/math.cu @@ -17,8 +17,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/components/cooperative_groups.cu b/cuda/test/components/cooperative_groups.cu index 1b514842e84..c9d9e6bf124 100644 --- a/cuda/test/components/cooperative_groups.cu +++ b/cuda/test/components/cooperative_groups.cu @@ -2,9 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "cuda/components/cooperative_groups.cuh" - - #include @@ -15,7 +12,8 @@ #include -#include "cuda/base/config.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu index 6ef7d3ab3c4..37b032eb794 100644 --- a/cuda/test/components/merging.cu +++ b/cuda/test/components/merging.cu @@ -18,7 +18,7 @@ #include -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/components/searching.cu b/cuda/test/components/searching.cu index 
0eeb383c05c..ffe00c247c0 100644 --- a/cuda/test/components/searching.cu +++ b/cuda/test/components/searching.cu @@ -17,7 +17,7 @@ #include -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/test/utils.hpp" diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 8c68efae046..035134ac4e1 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -93,7 +93,7 @@ string(REPLACE ";" "," GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE "${GKO_DPCPP_JACOBI_BLO configure_file(preconditioner/jacobi_common.hpp.in preconditioner/jacobi_common.hpp) ginkgo_compile_features(ginkgo_dpcpp) -target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMPILE_KERNEL=0) +target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP GKO_DEVICE_NAMESPACE=dpcpp _ONEDPL_COMPILE_KERNEL=0) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") @@ -126,7 +126,7 @@ ginkgo_default_includes(ginkgo_dpcpp) ginkgo_install_library(ginkgo_dpcpp) if (GINKGO_CHECK_CIRCULAR_DEPS) - ginkgo_check_headers(ginkgo_dpcpp GKO_COMPILING_DPCPP) + ginkgo_check_headers(ginkgo_dpcpp "GKO_COMPILING_DPCPP;GKO_GKO_DEVICE_NAMESPACE=dpcpp") endif() if(GINKGO_BUILD_TESTS) diff --git a/dpcpp/test/base/CMakeLists.txt b/dpcpp/test/base/CMakeLists.txt index bb9c8a75050..38ecad08271 100644 --- a/dpcpp/test/base/CMakeLists.txt +++ b/dpcpp/test/base/CMakeLists.txt @@ -2,4 +2,4 @@ ginkgo_create_dpcpp_test(executor) ginkgo_create_dpcpp_test(dim3) ginkgo_create_dpcpp_test(kernel_launch) # set correct flags for kernel_launch.hpp -target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP) +target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP GKO_DEVICE_NAMESPACE=dpcpp) diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 046fd1e4d7a..de44eb20682 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt 
@@ -119,7 +119,7 @@ target_include_directories(ginkgo_hip PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/.. # for generated headers like jacobi_common.hip.hpp ) -target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP) +target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip) target_link_libraries(ginkgo_hip PUBLIC ginkgo_device) target_link_libraries(ginkgo_hip PRIVATE hip::host roc::hipblas roc::hipsparse hip::hiprand roc::rocrand) @@ -138,7 +138,7 @@ ginkgo_default_includes(ginkgo_hip) ginkgo_install_library(ginkgo_hip) if (GINKGO_CHECK_CIRCULAR_DEPS) - ginkgo_check_headers(ginkgo_hip GKO_COMPILING_HIP) + ginkgo_check_headers(ginkgo_hip "GKO_COMPILING_HIP;GKO_GKO_DEVICE_NAMESPACE=hip") endif() if(GINKGO_BUILD_TESTS) diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp index f5a1dba3977..74e6c34dc5d 100644 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -5,7 +5,6 @@ #include "core/base/batch_multi_vector_kernels.hpp" -#include #include #include @@ -14,13 +13,14 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp index fa44a22b554..4f09ec66bb8 100644 --- a/hip/base/batch_struct.hip.hpp +++ 
b/hip/base/batch_struct.hip.hpp @@ -10,9 +10,9 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/base/batch_struct.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp index fbad841fd0f..3f531616489 100644 --- a/hip/base/config.hip.hpp +++ b/hip/base/config.hip.hpp @@ -6,15 +6,13 @@ #define GKO_HIP_BASE_CONFIG_HIP_HPP_ -#include - - #include #include +#include "common/cuda_hip/base/runtime.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp index 58376c2175b..be897510056 100644 --- a/hip/base/device.hip.cpp +++ b/hip/base/device.hip.cpp @@ -5,14 +5,12 @@ #include -#include - - #include #include #include +#include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/device_matrix_data_kernels.hip.cpp b/hip/base/device_matrix_data_kernels.hip.cpp index 745ba955014..5a0b762ea57 100644 --- a/hip/base/device_matrix_data_kernels.hip.cpp +++ b/hip/base/device_matrix_data_kernels.hip.cpp @@ -14,8 +14,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp index aed5e803d60..3f569576c28 100644 --- a/hip/base/exception.hip.cpp +++ b/hip/base/exception.hip.cpp @@ -8,7 +8,7 @@ #include -#include +#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #include diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index 2694ce4177f..4b5ce7afa7b 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -8,15 +8,13 @@ #include -#include - - #include #include #include -#include "hip/base/config.hip.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" #include 
"hip/base/hipblas_bindings.hip.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp index f4dd3f1a1e8..725c7e20698 100644 --- a/hip/base/hipblas_bindings.hip.hpp +++ b/hip/base/hipblas_bindings.hip.hpp @@ -6,7 +6,7 @@ #define GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_ -#include +#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -18,8 +18,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { @@ -260,6 +260,20 @@ inline void destroy_hipblas_handle(hipblasContext* handle) } // namespace hipblas + + +namespace blas { + + +using namespace hipblas; + + +#define BLAS_OP_N HIPBLAS_OP_N +#define BLAS_OP_T HIPBLAS_OP_T +#define BLAS_OP_C HIPBLAS_OP_C + + +} // namespace blas } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp index 471abb3ccd5..1dd772db250 100644 --- a/hip/base/hiprand_bindings.hip.hpp +++ b/hip/base/hiprand_bindings.hip.hpp @@ -6,7 +6,7 @@ #define GKO_HIP_BASE_HIPRAND_BINDINGS_HIP_HPP_ -#include +#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -17,8 +17,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { @@ -90,6 +90,18 @@ GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex, } // namespace hiprand + + +namespace randlib { + + +using namespace hiprand; + + +#define RANDLIB_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT + + +} // namespace randlib } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp index 62c7e60995e..997fc3d525f 100644 --- a/hip/base/hipsparse_bindings.hip.hpp +++ 
b/hip/base/hipsparse_bindings.hip.hpp @@ -6,7 +6,7 @@ #define GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_ -#include +#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -18,7 +18,7 @@ #include -#include "hip/base/types.hip.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { @@ -955,6 +955,20 @@ GKO_BIND_HIPSPARSE_IC0(std::complex, hipsparseZcsric02); } // namespace hipsparse + + +namespace sparselib { + + +using namespace hipsparse; + + +#define SPARSELIB_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define SPARSELIB_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define SPARSELIB_SOLVE_POLICY_USE_LEVEL HIPSPARSE_SOLVE_POLICY_USE_LEVEL + + +} // namespace sparselib } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp index eb9e8a31481..c69b0353f22 100644 --- a/hip/base/hipsparse_block_bindings.hip.hpp +++ b/hip/base/hipsparse_block_bindings.hip.hpp @@ -6,7 +6,7 @@ #define GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_ -#include +#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -17,8 +17,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp index 1a00e99cac7..890b9922a4c 100644 --- a/hip/base/kernel_launch.hip.hpp +++ b/hip/base/kernel_launch.hip.hpp @@ -8,12 +8,12 @@ #endif -#include #include -#include "accessor/hip_helper.hpp" -#include "hip/base/types.hip.hpp" +#include "accessor/cuda_hip_helper.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" #include "hip/components/thread_ids.hip.hpp" @@ -24,21 +24,21 @@ namespace hip { template struct to_device_type_impl&> { - using type = std::decay_t>()))>; static type 
map_to_device(gko::acc::range& range) { - return gko::acc::as_hip_range(range); + return gko::acc::as_device_range(range); } }; template struct to_device_type_impl&> { - using type = std::decay_t>()))>; static type map_to_device(const gko::acc::range& range) { - return gko::acc::as_hip_range(range); + return gko::acc::as_device_range(range); } }; diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index 7c5d0c01c9c..c32fb592de0 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -8,9 +8,9 @@ #endif +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp index 18532c9754c..eda18f35eab 100644 --- a/hip/base/kernel_launch_solver.hip.hpp +++ b/hip/base/kernel_launch_solver.hip.hpp @@ -8,7 +8,7 @@ #endif -#include +#include "common/cuda_hip/base/runtime.hpp" namespace gko { diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp index 0e14bf9f511..5fde8f518c6 100644 --- a/hip/base/memory.hip.cpp +++ b/hip/base/memory.hip.cpp @@ -5,12 +5,10 @@ #include -#include - - #include +#include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp index 2908164cccd..2c980b113a7 100644 --- a/hip/base/pointer_mode_guard.hip.hpp +++ b/hip/base/pointer_mode_guard.hip.hpp @@ -9,7 +9,7 @@ #include -#include +#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #include diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp index 0ed12a54786..46dad3be816 100644 --- 
a/hip/base/roctx.hip.cpp +++ b/hip/base/roctx.hip.cpp @@ -2,10 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include -#include +#include "common/cuda_hip/base/runtime.hpp" #if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX diff --git a/hip/base/scoped_device_id.hip.cpp b/hip/base/scoped_device_id.hip.cpp index ab6ed703da8..1fd7211b106 100644 --- a/hip/base/scoped_device_id.hip.cpp +++ b/hip/base/scoped_device_id.hip.cpp @@ -6,12 +6,10 @@ #include -#include - - #include +#include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp index 93c1fc008d9..b56c5104428 100644 --- a/hip/base/stream.hip.cpp +++ b/hip/base/stream.hip.cpp @@ -5,14 +5,12 @@ #include -#include - - #include #include #include +#include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/timer.hip.cpp b/hip/base/timer.hip.cpp index 44fe5b7cbeb..bd81d9f3be5 100644 --- a/hip/base/timer.hip.cpp +++ b/hip/base/timer.hip.cpp @@ -5,12 +5,10 @@ #include -#include - - #include +#include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 8827b2bea41..883e5812080 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -14,7 +14,9 @@ #include #include -#include + + +#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -430,6 +432,10 @@ GKO_INLINE GKO_ATTRIBUTES constexpr } +using gpuComplex = hipComplex; +using gpuDoubleComplex = hipDoubleComplex; + + } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/components/atomic.hip.hpp b/hip/components/atomic.hip.hpp index f57705ff408..0dc8d7a3b46 100644 --- a/hip/components/atomic.hip.hpp +++ b/hip/components/atomic.hip.hpp @@ -9,8 +9,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include 
"hip/base/types.hip.hpp" namespace gko { @@ -21,38 +21,6 @@ namespace hip { #include "common/cuda_hip/components/atomic.hpp.inc" -/** - * @internal - * - * @note It is not 'real' complex atomic add operation - */ -__forceinline__ __device__ thrust::complex atomic_add( - thrust::complex* __restrict__ address, thrust::complex val) -{ - hipComplex* addr = reinterpret_cast(address); - // Separate to real part and imag part - auto real = atomic_add(static_cast(&(addr->x)), val.real()); - auto imag = atomic_add(static_cast(&(addr->y)), val.imag()); - return {real, imag}; -} - - -/** - * @internal - * - * @note It is not 'real' complex atomic add operation - */ -__forceinline__ __device__ thrust::complex atomic_add( - thrust::complex* __restrict__ address, thrust::complex val) -{ - hipDoubleComplex* addr = reinterpret_cast(address); - // Separate to real part and imag part - auto real = atomic_add(static_cast(&(addr->x)), val.real()); - auto imag = atomic_add(static_cast(&(addr->y)), val.imag()); - return {real, imag}; -} - - } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index 247218a1457..e81441a092b 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -9,8 +9,8 @@ #include -#include "hip/base/config.hip.hpp" -#include "hip/base/types.hip.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { diff --git a/hip/components/diagonal_block_manipulation.hip.hpp b/hip/components/diagonal_block_manipulation.hip.hpp index 0261c7549c5..290511e7583 100644 --- a/hip/components/diagonal_block_manipulation.hip.hpp +++ b/hip/components/diagonal_block_manipulation.hip.hpp @@ -9,9 +9,9 @@ #include -#include "hip/base/config.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/base/config.hpp" 
+#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" namespace gko { diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp index 59c0405a874..07daf486d84 100644 --- a/hip/components/format_conversion.hip.hpp +++ b/hip/components/format_conversion.hip.hpp @@ -6,14 +6,12 @@ #define GKO_HIP_COMPONENTS_FORMAT_CONVERSION_HIP_HPP_ -#include - - #include #include -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp index fd4fbb8ce11..4bb6fa19ec0 100644 --- a/hip/components/memory.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -13,7 +13,7 @@ #include -#include "hip/base/types.hip.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { diff --git a/hip/components/prefix_sum.hip.hpp b/hip/components/prefix_sum.hip.hpp index b5065589d8e..5acde03cbec 100644 --- a/hip/components/prefix_sum.hip.hpp +++ b/hip/components/prefix_sum.hip.hpp @@ -9,8 +9,8 @@ #include -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp index c8fa5e58b4f..fb0539952ff 100644 --- a/hip/components/reduction.hip.hpp +++ b/hip/components/reduction.hip.hpp @@ -9,16 +9,15 @@ #include -#include - - #include #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" -#include 
"hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" @@ -57,7 +56,6 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, block_results.resize_and_reset(grid_dim); - reduce_add_array<<get_stream()>>>( size, as_device_type(source), diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp index 2a6be767c2c..9222de9e1d6 100644 --- a/hip/components/searching.hip.hpp +++ b/hip/components/searching.hip.hpp @@ -6,7 +6,7 @@ #define GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_ -#include "hip/base/config.hip.hpp" +#include "common/cuda_hip/base/config.hpp" #include "hip/components/intrinsics.hip.hpp" diff --git a/hip/components/segment_scan.hip.hpp b/hip/components/segment_scan.hip.hpp index 7f98d08cf69..93ebb35833a 100644 --- a/hip/components/segment_scan.hip.hpp +++ b/hip/components/segment_scan.hip.hpp @@ -6,7 +6,7 @@ #define GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_ -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" namespace gko { diff --git a/hip/components/sorting.hip.hpp b/hip/components/sorting.hip.hpp index 730c3c56401..4a664aee453 100644 --- a/hip/components/sorting.hip.hpp +++ b/hip/components/sorting.hip.hpp @@ -6,8 +6,8 @@ #define GKO_HIP_COMPONENTS_SORTING_HIP_HPP_ -#include "hip/base/config.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" namespace gko { diff --git a/hip/components/syncfree.hip.hpp b/hip/components/syncfree.hip.hpp index 9fe48944b56..7627a0a2781 100644 --- a/hip/components/syncfree.hip.hpp +++ b/hip/components/syncfree.hip.hpp @@ -9,11 +9,11 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/memory.hpp" #include 
"core/components/fill_array_kernels.hpp" -#include "hip/base/config.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/memory.hip.hpp" namespace gko { diff --git a/hip/components/thread_ids.hip.hpp b/hip/components/thread_ids.hip.hpp index 03761983e02..6f0bd44ba9c 100644 --- a/hip/components/thread_ids.hip.hpp +++ b/hip/components/thread_ids.hip.hpp @@ -6,17 +6,12 @@ #define GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_ -#include "hip/base/config.hip.hpp" +#include "common/cuda_hip/base/config.hpp" namespace gko { namespace kernels { namespace hip { -/** - * @brief The HIP thread namespace. - * - * @ingroup hip_thread - */ namespace thread { diff --git a/hip/factorization/cholesky_kernels.hip.cpp b/hip/factorization/cholesky_kernels.hip.cpp index 1dd94bb05d0..419db21b811 100644 --- a/hip/factorization/cholesky_kernels.hip.cpp +++ b/hip/factorization/cholesky_kernels.hip.cpp @@ -20,15 +20,15 @@ #include +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" #include "core/factorization/elimination_forest.hpp" #include "core/factorization/lu_kernels.hpp" #include "core/matrix/csr_lookup.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/syncfree.hip.hpp" @@ -80,19 +80,19 @@ void symbolic_count(std::shared_ptr exec, } // sort postorder_cols inside rows { - const auto handle = exec->get_hipsparse_handle(); - auto descr = hipsparse::create_mat_descr(); + const auto handle = exec->get_sparselib_handle(); + auto descr = sparselib::create_mat_descr(); array permutation_array(exec, mtx_nnz); auto 
permutation = permutation_array.get_data(); components::fill_seq_array(exec, permutation, mtx_nnz); size_type buffer_size{}; - hipsparse::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, + sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, row_ptrs, postorder_cols, buffer_size); array buffer_array{exec, buffer_size}; auto buffer = buffer_array.get_data(); - hipsparse::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, + sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, postorder_cols, permutation, buffer); - hipsparse::destroy(descr); + sparselib::destroy(descr); } // count nonzeros per row of L { diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp index a2de4912fdb..4080768bc07 100644 --- a/hip/factorization/factorization_kernels.hip.cpp +++ b/hip/factorization/factorization_kernels.hip.cpp @@ -5,17 +5,16 @@ #include "core/factorization/factorization_kernels.hpp" -#include - - #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/searching.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/factorization/ic_kernels.hip.cpp b/hip/factorization/ic_kernels.hip.cpp index 7a845547d0d..edda974fd36 100644 --- a/hip/factorization/ic_kernels.hip.cpp +++ b/hip/factorization/ic_kernels.hip.cpp @@ -5,13 +5,11 @@ #include "core/factorization/ic_kernels.hpp" -#include - - #include -#include "hip/base/hipsparse_bindings.hip.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include 
"common/cuda_hip/base/sparselib_bindings.hpp" namespace gko { @@ -30,32 +28,32 @@ void compute(std::shared_ptr exec, matrix::Csr* m) { const auto id = exec->get_device_id(); - auto handle = exec->get_hipsparse_handle(); - auto desc = hipsparse::create_mat_descr(); - auto info = hipsparse::create_ic0_info(); + auto handle = exec->get_sparselib_handle(); + auto desc = sparselib::create_mat_descr(); + auto info = sparselib::create_ic0_info(); // get buffer size for IC IndexType num_rows = m->get_size()[0]; IndexType nnz = m->get_num_stored_elements(); size_type buffer_size{}; - hipsparse::ic0_buffer_size(handle, num_rows, nnz, desc, + sparselib::ic0_buffer_size(handle, num_rows, nnz, desc, m->get_const_values(), m->get_const_row_ptrs(), m->get_const_col_idxs(), info, buffer_size); array buffer{exec, buffer_size}; // set up IC(0) - hipsparse::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), + sparselib::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), m->get_const_row_ptrs(), m->get_const_col_idxs(), - info, HIPSPARSE_SOLVE_POLICY_USE_LEVEL, + info, SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); - hipsparse::ic0(handle, num_rows, nnz, desc, m->get_values(), + sparselib::ic0(handle, num_rows, nnz, desc, m->get_values(), m->get_const_row_ptrs(), m->get_const_col_idxs(), info, - HIPSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); + SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); - hipsparse::destroy_ic0_info(info); - hipsparse::destroy(desc); + sparselib::destroy_ic0_info(info); + sparselib::destroy(desc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL); diff --git a/hip/factorization/ilu_kernels.hip.cpp b/hip/factorization/ilu_kernels.hip.cpp index 071d3721536..f50df5ca75b 100644 --- a/hip/factorization/ilu_kernels.hip.cpp +++ b/hip/factorization/ilu_kernels.hip.cpp @@ -5,13 +5,11 @@ #include "core/factorization/ilu_kernels.hpp" -#include - - #include -#include 
"hip/base/hipsparse_bindings.hip.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" namespace gko { @@ -30,32 +28,32 @@ void compute_lu(std::shared_ptr exec, matrix::Csr* m) { const auto id = exec->get_device_id(); - auto handle = exec->get_hipsparse_handle(); - auto desc = hipsparse::create_mat_descr(); - auto info = hipsparse::create_ilu0_info(); + auto handle = exec->get_sparselib_handle(); + auto desc = sparselib::create_mat_descr(); + auto info = sparselib::create_ilu0_info(); // get buffer size for ILU IndexType num_rows = m->get_size()[0]; IndexType nnz = m->get_num_stored_elements(); size_type buffer_size{}; - hipsparse::ilu0_buffer_size(handle, num_rows, nnz, desc, + sparselib::ilu0_buffer_size(handle, num_rows, nnz, desc, m->get_const_values(), m->get_const_row_ptrs(), m->get_const_col_idxs(), info, buffer_size); array buffer{exec, buffer_size}; // set up ILU(0) - hipsparse::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), + sparselib::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), m->get_const_row_ptrs(), m->get_const_col_idxs(), - info, HIPSPARSE_SOLVE_POLICY_USE_LEVEL, + info, SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); - hipsparse::ilu0(handle, num_rows, nnz, desc, m->get_values(), + sparselib::ilu0(handle, num_rows, nnz, desc, m->get_values(), m->get_const_row_ptrs(), m->get_const_col_idxs(), info, - HIPSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); + SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); - hipsparse::destroy_ilu0_info(info); - hipsparse::destroy(desc); + sparselib::destroy_ilu0_info(info); + sparselib::destroy(desc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/factorization/lu_kernels.hip.cpp b/hip/factorization/lu_kernels.hip.cpp index e1c60103dd3..ec3e771134e 100644 --- a/hip/factorization/lu_kernels.hip.cpp +++ b/hip/factorization/lu_kernels.hip.cpp @@ -17,11 +17,11 @@ #include +#include 
"common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/allocator.hpp" #include "core/matrix/csr_lookup.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/syncfree.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp index dd91ac27339..e4cd0b2470b 100644 --- a/hip/factorization/par_ic_kernels.hip.cpp +++ b/hip/factorization/par_ic_kernels.hip.cpp @@ -10,9 +10,9 @@ #include +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/memory.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp index 4b27383bff5..7f5dba82eba 100644 --- a/hip/factorization/par_ict_kernels.hip.cpp +++ b/hip/factorization/par_ict_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/factorization/par_ict_kernels.hpp" -#include - - #include #include #include @@ -15,6 +12,8 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" @@ -22,7 +21,6 @@ #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/math.hip.hpp" #include "hip/components/intrinsics.hip.hpp" -#include "hip/components/memory.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" @@ -49,8 +47,7 @@ using compiled_kernels = syn::value_list; -#include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc" -#include 
"common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc" +#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc" namespace { diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp index b10941d44f1..fc05273bb09 100644 --- a/hip/factorization/par_ilu_kernels.hip.cpp +++ b/hip/factorization/par_ilu_kernels.hip.cpp @@ -5,16 +5,14 @@ #include "core/factorization/par_ilu_kernels.hpp" -#include - - #include #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/memory.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp similarity index 97% rename from hip/factorization/par_ilut_approx_filter_kernel.hip.cpp rename to hip/factorization/par_ilut_approx_filter_kernels.hip.cpp index d730e33e418..b5612ea29c6 100644 --- a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp @@ -8,9 +8,6 @@ #include -#include - - #include #include #include @@ -18,16 +15,17 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include 
"hip/components/prefix_sum.hip.hpp" #include "hip/components/sorting.hip.hpp" diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernels.hip.cpp similarity index 96% rename from hip/factorization/par_ilut_filter_kernel.hip.cpp rename to hip/factorization/par_ilut_filter_kernels.hip.cpp index eef1044878e..e6d0a6348cc 100644 --- a/hip/factorization/par_ilut_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_filter_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/factorization/par_ilut_kernels.hpp" -#include - - #include #include #include @@ -15,15 +12,16 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp index 85c2eaa7036..ddad307dc62 100644 --- a/hip/factorization/par_ilut_select_common.hip.cpp +++ b/hip/factorization/par_ilut_select_common.hip.cpp @@ -4,7 +4,7 @@ // force-top: on // prevent compilation failure related to disappearing assert(...) 
statements -#include +#include "common/cuda_hip/base/runtime.hpp" // force-top: off diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp similarity index 99% rename from hip/factorization/par_ilut_select_kernel.hip.cpp rename to hip/factorization/par_ilut_select_kernels.hip.cpp index b6d93e65b24..b259133b95d 100644 --- a/hip/factorization/par_ilut_select_kernel.hip.cpp +++ b/hip/factorization/par_ilut_select_kernels.hip.cpp @@ -8,14 +8,12 @@ #include -#include - - #include #include #include +#include "common/cuda_hip/base/runtime.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "hip/base/math.hip.hpp" #include "hip/components/atomic.hip.hpp" diff --git a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp similarity index 98% rename from hip/factorization/par_ilut_spgeam_kernel.hip.cpp rename to hip/factorization/par_ilut_spgeam_kernels.hip.cpp index ad102e49488..df77b1ba7a2 100644 --- a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp +++ b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/factorization/par_ilut_kernels.hpp" -#include - - #include #include #include @@ -15,13 +12,14 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/math.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/prefix_sum.hip.hpp" diff --git a/hip/factorization/par_ilut_sweep_kernel.hip.cpp b/hip/factorization/par_ilut_sweep_kernels.hip.cpp similarity index 97% rename from 
hip/factorization/par_ilut_sweep_kernel.hip.cpp rename to hip/factorization/par_ilut_sweep_kernels.hip.cpp index bdcecc609d5..0f1e6455812 100644 --- a/hip/factorization/par_ilut_sweep_kernel.hip.cpp +++ b/hip/factorization/par_ilut_sweep_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/factorization/par_ilut_kernels.hpp" -#include - - #include #include #include @@ -15,6 +12,8 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" @@ -22,7 +21,6 @@ #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/math.hip.hpp" #include "hip/components/intrinsics.hip.hpp" -#include "hip/components/memory.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" @@ -85,7 +83,6 @@ void compute_l_u_factors(syn::value_list, } } - GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors, compute_l_u_factors); diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp index 432213f3083..de73576ffed 100644 --- a/hip/matrix/batch_csr_kernels.hip.cpp +++ b/hip/matrix/batch_csr_kernels.hip.cpp @@ -5,7 +5,6 @@ #include "core/matrix/batch_csr_kernels.hpp" -#include #include @@ -14,12 +13,13 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git 
a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp index 0d03d4ea10b..5d3b9d8cef9 100644 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -5,19 +5,21 @@ #include "core/matrix/batch_dense_kernels.hpp" -#include #include +#include #include +#include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp index 221a3ec65dd..d415f114c3b 100644 --- a/hip/matrix/batch_ell_kernels.hip.cpp +++ b/hip/matrix/batch_ell_kernels.hip.cpp @@ -5,7 +5,6 @@ #include "core/matrix/batch_ell_kernels.hpp" -#include #include @@ -14,12 +13,13 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp index 6c98146161e..16a267d95b6 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/matrix/batch_struct.hip.hpp @@ -13,8 +13,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include 
"core/base/batch_struct.hpp" -#include "hip/base/types.hip.hpp" namespace gko { diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp index 5e32e1d8502..8f7a050ef87 100644 --- a/hip/matrix/coo_kernels.hip.cpp +++ b/hip/matrix/coo_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/matrix/coo_kernels.hpp" -#include - - #include #include #include @@ -15,25 +12,21 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" #include "core/matrix/dense_kernels.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/format_conversion.hip.hpp" #include "hip/components/segment_scan.hip.hpp" #include "hip/components/thread_ids.hip.hpp" namespace gko { namespace kernels { -/** - * @brief The HIP namespace. - * - * @ingroup hip - */ namespace hip { /** * @brief The Coordinate matrix format namespace. 
diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp index 599a2df3669..31debd60a3d 100644 --- a/hip/matrix/csr_kernels.template.hip.cpp +++ b/hip/matrix/csr_kernels.template.hip.cpp @@ -8,7 +8,6 @@ #include -#include #include #include #include @@ -28,7 +27,13 @@ #include -#include "accessor/hip_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" @@ -39,14 +44,9 @@ #include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/prefix_sum.hip.hpp" @@ -133,10 +133,11 @@ void merge_path_spmv(syn::value_list, kernel::abstract_merge_path_spmv <<get_stream()>>>( static_cast(a->get_size()[0]), - acc::as_hip_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_hip_range(b_vals), acc::as_hip_range(c_vals), + acc::as_device_range(b_vals), + acc::as_device_range(c_vals), as_device_type(row_out.get_data()), as_device_type(val_out.get_data())); } @@ -144,7 +145,7 @@ 
void merge_path_spmv(syn::value_list, abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>( grid_num, as_device_type(val_out.get_data()), as_device_type(row_out.get_data()), - acc::as_hip_range(c_vals)); + acc::as_device_range(c_vals)); } else if (alpha != nullptr && beta != nullptr) { if (grid_num > 0) { @@ -152,12 +153,12 @@ void merge_path_spmv(syn::value_list, <<get_stream()>>>( static_cast(a->get_size()[0]), as_device_type(alpha->get_const_values()), - acc::as_hip_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_hip_range(b_vals), + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), - acc::as_hip_range(c_vals), + acc::as_device_range(c_vals), as_device_type(row_out.get_data()), as_device_type(val_out.get_data())); } @@ -166,7 +167,7 @@ void merge_path_spmv(syn::value_list, grid_num, as_device_type(val_out.get_data()), as_device_type(row_out.get_data()), as_device_type(alpha->get_const_values()), - acc::as_hip_range(c_vals)); + acc::as_device_range(c_vals)); } else { GKO_KERNEL_NOT_FOUND; } @@ -262,21 +263,21 @@ void classical_spmv(syn::value_list, if (grid.x > 0 && grid.y > 0) { kernel::abstract_classical_spmv <<get_stream()>>>( - a->get_size()[0], acc::as_hip_range(a_vals), + a->get_size()[0], acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_hip_range(b_vals), acc::as_hip_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } } else if (alpha != nullptr && beta != nullptr) { if (grid.x > 0 && grid.y > 0) { kernel::abstract_classical_spmv <<get_stream()>>>( a->get_size()[0], as_device_type(alpha->get_const_values()), - acc::as_hip_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_hip_range(b_vals), + 
acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), - acc::as_hip_range(c_vals)); + acc::as_device_range(c_vals)); } } else { GKO_KERNEL_NOT_FOUND; @@ -318,20 +319,20 @@ void load_balance_spmv(std::shared_ptr exec, exec->get_stream()>>>( nwarps, static_cast(a->get_size()[0]), as_device_type(alpha->get_const_values()), - acc::as_hip_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_hip_range(b_vals), acc::as_hip_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } } else { if (csr_grid.x > 0 && csr_grid.y > 0) { kernel::abstract_spmv<<get_stream()>>>( nwarps, static_cast(a->get_size()[0]), - acc::as_hip_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_hip_range(b_vals), acc::as_hip_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } } } @@ -346,24 +347,24 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, const ValueType* beta, matrix::Dense* c) { - bool try_sparselib = hipsparse::is_supported::value; + bool try_sparselib = sparselib::is_supported::value; try_sparselib = try_sparselib && b->get_stride() == 1 && c->get_stride() == 1; // rocSPARSE has issues with zero matrices try_sparselib = try_sparselib && a->get_num_stored_elements() > 0; if (try_sparselib) { - auto descr = hipsparse::create_mat_descr(); + auto descr = sparselib::create_mat_descr(); auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - hipsparse::spmv(exec->get_hipsparse_handle(), - HIPSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0], + sparselib::spmv(exec->get_sparselib_handle(), + SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0], a->get_size()[1], a->get_num_stored_elements(), alpha, descr, a->get_const_values(), 
row_ptrs, col_idxs, b->get_const_values(), beta, c->get_values()); - hipsparse::destroy(descr); + sparselib::destroy(descr); } return try_sparselib; } @@ -397,8 +398,8 @@ bool try_sparselib_spmv(std::shared_ptr exec, return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b, beta->get_const_values(), c); } else { - auto handle = exec->get_hipsparse_handle(); - hipsparse::pointer_mode_guard pm_guard(handle); + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); const auto valpha = one(); const auto vbeta = zero(); return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c); @@ -535,14 +536,14 @@ void spgemm(std::shared_ptr exec, const matrix::Csr* b, matrix::Csr* c) { - if (hipsparse::is_supported::value) { - auto handle = exec->get_hipsparse_handle(); - hipsparse::pointer_mode_guard pm_guard(handle); - auto a_descr = hipsparse::create_mat_descr(); - auto b_descr = hipsparse::create_mat_descr(); - auto c_descr = hipsparse::create_mat_descr(); - auto d_descr = hipsparse::create_mat_descr(); - auto info = hipsparse::create_spgemm_info(); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); + auto a_descr = sparselib::create_mat_descr(); + auto b_descr = sparselib::create_mat_descr(); + auto c_descr = sparselib::create_mat_descr(); + auto d_descr = sparselib::create_mat_descr(); + auto info = sparselib::create_spgemm_info(); auto alpha = one(); auto a_nnz = static_cast(a->get_num_stored_elements()); @@ -566,7 +567,7 @@ void spgemm(std::shared_ptr exec, // allocate buffer size_type buffer_size{}; - hipsparse::spgemm_buffer_size( + sparselib::spgemm_buffer_size( handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz, null_index, null_index, info, buffer_size); @@ -575,7 +576,7 @@ void spgemm(std::shared_ptr exec, // count nnz IndexType c_nnz{}; - 
hipsparse::spgemm_nnz( + sparselib::spgemm_nnz( handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, d_descr, zero_nnz, null_index, null_index, c_descr, c_row_ptrs, &c_nnz, info, buffer); @@ -585,17 +586,17 @@ void spgemm(std::shared_ptr exec, c_vals_array.resize_and_reset(c_nnz); auto c_col_idxs = c_col_idxs_array.get_data(); auto c_vals = c_vals_array.get_data(); - hipsparse::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals, + sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz, null_value, null_index, null_index, c_descr, c_vals, c_row_ptrs, c_col_idxs, info, buffer); - hipsparse::destroy_spgemm_info(info); - hipsparse::destroy(d_descr); - hipsparse::destroy(c_descr); - hipsparse::destroy(b_descr); - hipsparse::destroy(a_descr); + sparselib::destroy_spgemm_info(info); + sparselib::destroy(d_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); } else { GKO_NOT_IMPLEMENTED; } @@ -611,14 +612,14 @@ void advanced_spgemm(std::shared_ptr exec, const matrix::Csr* d, matrix::Csr* c) { - if (hipsparse::is_supported::value) { - auto handle = exec->get_hipsparse_handle(); - hipsparse::pointer_mode_guard pm_guard(handle); - auto a_descr = hipsparse::create_mat_descr(); - auto b_descr = hipsparse::create_mat_descr(); - auto c_descr = hipsparse::create_mat_descr(); - auto d_descr = hipsparse::create_mat_descr(); - auto info = hipsparse::create_spgemm_info(); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); + auto a_descr = sparselib::create_mat_descr(); + auto b_descr = sparselib::create_mat_descr(); + auto c_descr = sparselib::create_mat_descr(); + auto d_descr = sparselib::create_mat_descr(); + auto info = sparselib::create_spgemm_info(); auto a_nnz = 
static_cast(a->get_num_stored_elements()); auto a_vals = a->get_const_values(); @@ -640,7 +641,7 @@ void advanced_spgemm(std::shared_ptr exec, // allocate buffer size_type buffer_size{}; - hipsparse::spgemm_buffer_size( + sparselib::spgemm_buffer_size( handle, m, n, k, &one_value, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, IndexType{}, null_index, null_index, info, buffer_size); @@ -651,7 +652,7 @@ void advanced_spgemm(std::shared_ptr exec, array c_tmp_row_ptrs_array(exec, m + 1); auto c_tmp_row_ptrs = c_tmp_row_ptrs_array.get_data(); IndexType c_nnz{}; - hipsparse::spgemm_nnz( + sparselib::spgemm_nnz( handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, d_descr, IndexType{}, null_index, null_index, c_descr, c_tmp_row_ptrs, &c_nnz, info, buffer); @@ -661,7 +662,7 @@ void advanced_spgemm(std::shared_ptr exec, array c_tmp_vals_array(exec, c_nnz); auto c_tmp_col_idxs = c_tmp_col_idxs_array.get_data(); auto c_tmp_vals = c_tmp_vals_array.get_data(); - hipsparse::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals, + sparselib::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, b_row_ptrs, b_col_idxs, null_value, d_descr, IndexType{}, null_value, null_index, null_index, @@ -669,11 +670,11 @@ void advanced_spgemm(std::shared_ptr exec, info, buffer); // destroy hipsparse context - hipsparse::destroy_spgemm_info(info); - hipsparse::destroy(d_descr); - hipsparse::destroy(c_descr); - hipsparse::destroy(b_descr); - hipsparse::destroy(a_descr); + sparselib::destroy_spgemm_info(info); + sparselib::destroy(d_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); auto total_nnz = c_nnz + d->get_num_stored_elements(); auto nnz_per_row = total_nnz / m; @@ -701,12 +702,12 @@ void transpose(std::shared_ptr exec, if (orig->get_size()[0] == 0) { return; } - if 
(hipsparse::is_supported::value) { + if (sparselib::is_supported::value) { hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC; hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO; - hipsparse::transpose( - exec->get_hipsparse_handle(), orig->get_size()[0], + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -728,12 +729,12 @@ void conj_transpose(std::shared_ptr exec, const auto block_size = default_block_size; const auto grid_size = ceildiv(trans->get_num_stored_elements(), block_size); - if (hipsparse::is_supported::value) { + if (sparselib::is_supported::value) { hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC; hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO; - hipsparse::transpose( - exec->get_hipsparse_handle(), orig->get_size()[0], + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -753,9 +754,9 @@ template void sort_by_column_index(std::shared_ptr exec, matrix::Csr* to_sort) { - if (hipsparse::is_supported::value) { - auto handle = exec->get_hipsparse_handle(); - auto descr = hipsparse::create_mat_descr(); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + auto descr = sparselib::create_mat_descr(); auto m = IndexType(to_sort->get_size()[0]); auto n = IndexType(to_sort->get_size()[1]); auto nnz = IndexType(to_sort->get_num_stored_elements()); @@ -771,23 +772,23 @@ void sort_by_column_index(std::shared_ptr exec, // init identity permutation array permutation_array(exec, nnz); auto permutation = permutation_array.get_data(); - hipsparse::create_identity_permutation(handle, nnz, permutation); + 
sparselib::create_identity_permutation(handle, nnz, permutation); // allocate buffer size_type buffer_size{}; - hipsparse::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs, + sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs, buffer_size); array buffer_array{exec, buffer_size}; auto buffer = buffer_array.get_data(); // sort column indices - hipsparse::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs, + sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs, permutation, buffer); // sort values - hipsparse::gather(handle, nnz, tmp_vals, vals, permutation); + sparselib::gather(handle, nnz, tmp_vals, vals, permutation); - hipsparse::destroy(descr); + sparselib::destroy(descr); } else { fallback_sort(exec, to_sort); } diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 36e581049e0..8fed3c97c1b 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/matrix/dense_kernels.hpp" -#include - - #include #include #include @@ -20,12 +17,13 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" @@ -56,11 +54,11 @@ void compute_dot_dispatch(std::shared_ptr exec, matrix::Dense* result, array& tmp) { if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); - 
hipblas::dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::dot(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), y->get_stride(), + result->get_values()); } else { compute_dot(exec, x, y, result, tmp); } @@ -81,11 +79,11 @@ void compute_conj_dot_dispatch(std::shared_ptr exec, array& tmp) { if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); - hipblas::conj_dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::conj_dot(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), + y->get_stride(), result->get_values()); } else { compute_conj_dot(exec, x, y, result, tmp); } @@ -105,10 +103,10 @@ void compute_norm2_dispatch(std::shared_ptr exec, array& tmp) { if (x->get_size()[1] == 1) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); - hipblas::norm2(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::norm2(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), result->get_values()); } else { compute_norm2(exec, x, result, tmp); } @@ -127,19 +125,18 @@ void simple_apply(std::shared_ptr exec, const matrix::Dense* b, matrix::Dense* c) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { if (a->get_size()[1] > 0) { - hipblas::pointer_mode_guard pm_guard(handle); + 
blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - hipblas::gemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, - c->get_size()[1], c->get_size()[0], - a->get_size()[1], &alpha, b->get_const_values(), - b->get_stride(), a->get_const_values(), - a->get_stride(), &beta, c->get_values(), - c->get_stride()); + blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1], + c->get_size()[0], a->get_size()[1], &alpha, + b->get_const_values(), b->get_stride(), + a->get_const_values(), a->get_stride(), &beta, + c->get_values(), c->get_stride()); } else { dense::fill(exec, c, zero()); } @@ -158,15 +155,15 @@ void apply(std::shared_ptr exec, const matrix::Dense* a, const matrix::Dense* b, const matrix::Dense* beta, matrix::Dense* c) { - if (hipblas::is_supported::value) { + if (blas::is_supported::value) { if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { if (a->get_size()[1] > 0) { - hipblas::gemm( - exec->get_hipblas_handle(), HIPBLAS_OP_N, HIPBLAS_OP_N, - c->get_size()[1], c->get_size()[0], a->get_size()[1], - alpha->get_const_values(), b->get_const_values(), - b->get_stride(), a->get_const_values(), a->get_stride(), - beta->get_const_values(), c->get_values(), c->get_stride()); + blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N, + c->get_size()[1], c->get_size()[0], a->get_size()[1], + alpha->get_const_values(), b->get_const_values(), + b->get_stride(), a->get_const_values(), + a->get_stride(), beta->get_const_values(), + c->get_values(), c->get_stride()); } else { dense::scale(exec, beta, c); } @@ -184,17 +181,17 @@ void transpose(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* trans) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - hipblas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = 
zero(); - hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, - orig->get_size()[0], orig->get_size()[1], &alpha, - orig->get_const_values(), orig->get_stride(), &beta, - trans->get_const_values(), trans->get_stride(), - trans->get_values(), trans->get_stride()); + blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, trans->get_const_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; @@ -209,17 +206,17 @@ void conj_transpose(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* trans) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - hipblas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - hipblas::geam(handle, HIPBLAS_OP_C, HIPBLAS_OP_N, - orig->get_size()[0], orig->get_size()[1], &alpha, - orig->get_const_values(), orig->get_stride(), &beta, - trans->get_values(), trans->get_stride(), - trans->get_values(), trans->get_stride()); + blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, trans->get_const_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; diff --git a/hip/matrix/diagonal_kernels.hip.cpp b/hip/matrix/diagonal_kernels.hip.cpp index deedb9543ec..01033004c6b 100644 --- a/hip/matrix/diagonal_kernels.hip.cpp +++ b/hip/matrix/diagonal_kernels.hip.cpp @@ -5,16 +5,14 @@ #include "core/matrix/diagonal_kernels.hpp" -#include - - #include #include -#include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" +#include 
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 51c34430f5c..4f1ff6a3539 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -8,9 +8,6 @@ #include -#include - - #include #include #include @@ -18,19 +15,20 @@ #include -#include "accessor/hip_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" #include "accessor/reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/format_conversion.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" @@ -133,20 +131,21 @@ void abstract_spmv(syn::value_list, if (grid_size.x > 0 && grid_size.y > 0) { kernel::spmv <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_hip_range(a_vals), + nrows, num_worker_per_row, acc::as_device_range(a_vals), a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_hip_range(b_vals), + num_stored_elements_per_row, acc::as_device_range(b_vals), as_device_type(c->get_values()), c->get_stride()); } } 
else if (alpha != nullptr && beta != nullptr) { + const auto alpha_val = acc::range( + std::array{1}, alpha->get_const_values()); if (grid_size.x > 0 && grid_size.y > 0) { - const auto alpha_val = acc::range( - std::array{1}, alpha->get_const_values()); kernel::spmv <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_hip_range(alpha_val), - acc::as_hip_range(a_vals), a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_hip_range(b_vals), + nrows, num_worker_per_row, acc::as_device_range(alpha_val), + acc::as_device_range(a_vals), a->get_const_col_idxs(), + stride, num_stored_elements_per_row, + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), as_device_type(c->get_values()), c->get_stride()); } @@ -215,7 +214,7 @@ void spmv(std::shared_ptr exec, const int num_worker_per_row = std::get<2>(data); /** - * info is the parameter for selecting the hip kernel. + * info is the parameter for selecting the device kernel. * for info == 0, it uses the kernel by warp_size threads with atomic * operation for other value, it uses the kernel without atomic_add */ @@ -249,7 +248,7 @@ void advanced_spmv(std::shared_ptr exec, const int num_worker_per_row = std::get<2>(data); /** - * info is the parameter for selecting the hip kernel. + * info is the parameter for selecting the device kernel. 
* for info == 0, it uses the kernel by warp_size threads with atomic * operation for other value, it uses the kernel without atomic_add */ diff --git a/hip/matrix/fbcsr_kernels.template.hip.cpp b/hip/matrix/fbcsr_kernels.template.hip.cpp index b84e7644e80..0286aff0bba 100644 --- a/hip/matrix/fbcsr_kernels.template.hip.cpp +++ b/hip/matrix/fbcsr_kernels.template.hip.cpp @@ -8,7 +8,6 @@ #include -#include #include #include #include @@ -25,6 +24,13 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/unified/base/kernel_launch.hpp" #include "core/base/array_access.hpp" #include "core/base/block_sizes.hpp" @@ -34,22 +40,17 @@ #include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/hipsparse_block_bindings.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" + namespace gko { namespace kernels { namespace hip { @@ -82,15 +83,15 @@ void dense_transpose(std::shared_ptr exec, if (nrows == 0) { return; } - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); + if 
(blas::is_supported::value) { + auto handle = exec->get_blas_handle(); { - hipblas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, nrows, ncols, - &alpha, orig, orig_stride, &beta, trans, trans_stride, - trans, trans_stride); + blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig, + orig_stride, &beta, trans, trans_stride, trans, + trans_stride); } } else { GKO_NOT_IMPLEMENTED; @@ -116,12 +117,12 @@ void spmv(std::shared_ptr exec, dense::fill(exec, c, zero()); return; } - if (hipsparse::is_supported::value) { - auto handle = exec->get_hipsparse_handle(); - hipsparse::pointer_mode_guard pm_guard(handle); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); const auto alpha = one(); const auto beta = zero(); - auto descr = hipsparse::create_mat_descr(); + auto descr = sparselib::create_mat_descr(); const auto row_ptrs = a->get_const_row_ptrs(); const auto col_idxs = a->get_const_col_idxs(); const auto values = a->get_const_values(); @@ -135,21 +136,21 @@ void spmv(std::shared_ptr exec, const auto in_stride = b->get_stride(); const auto out_stride = c->get_stride(); if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), &beta, c->get_values()); } else { const auto trans_stride = nrows; auto trans_c = array(exec, nrows * nrhs); - hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, - HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), in_stride, 
&beta, trans_c.get_data(), trans_stride); dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), out_stride, c->get_values()); } - hipsparse::destroy(descr); + sparselib::destroy(descr); } else { GKO_NOT_IMPLEMENTED; } @@ -173,11 +174,11 @@ void advanced_spmv(std::shared_ptr exec, dense::scale(exec, beta, c); return; } - if (hipsparse::is_supported::value) { - auto handle = exec->get_hipsparse_handle(); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); const auto alphp = alpha->get_const_values(); const auto betap = beta->get_const_values(); - auto descr = hipsparse::create_mat_descr(); + auto descr = sparselib::create_mat_descr(); const auto row_ptrs = a->get_const_row_ptrs(); const auto col_idxs = a->get_const_col_idxs(); const auto values = a->get_const_values(); @@ -191,7 +192,7 @@ void advanced_spmv(std::shared_ptr exec, const auto in_stride = b->get_stride(); const auto out_stride = c->get_stride(); if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), betap, c->get_values()); } else { @@ -199,27 +200,83 @@ void advanced_spmv(std::shared_ptr exec, auto trans_c = array(exec, nrows * nrhs); dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), trans_stride, trans_c.get_data()); - hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, - HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), in_stride, betap, trans_c.get_data(), trans_stride); dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), out_stride, c->get_values()); } - hipsparse::destroy(descr); + sparselib::destroy(descr); } else { 
GKO_NOT_IMPLEMENTED; } } +namespace { + + +template +void transpose_blocks_impl(syn::value_list, + std::shared_ptr exec, + matrix::Fbcsr* const mat) +{ + constexpr int subwarp_size = config::warp_size; + const auto nbnz = mat->get_num_stored_blocks(); + const auto numthreads = nbnz * subwarp_size; + const auto block_size = default_block_size; + const auto grid_dim = ceildiv(numthreads, block_size); + if (grid_dim > 0) { + kernel::transpose_blocks + <<get_stream()>>>( + nbnz, mat->get_values()); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks, + transpose_blocks_impl); + + +} // namespace + + template void transpose(const std::shared_ptr exec, - const matrix::Fbcsr* const input, - matrix::Fbcsr* const output) + const matrix::Fbcsr* const orig, + matrix::Fbcsr* const trans) { - fallback_transpose(exec, input, output); +#ifdef GKO_COMPILING_CUDA + if (sparselib::is_supported::value) { + const int bs = orig->get_block_size(); + const IndexType nnzb = + static_cast(orig->get_num_stored_blocks()); + cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; + cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; + const IndexType buffer_size = sparselib::bsr_transpose_buffersize( + exec->get_sparselib_handle(), orig->get_num_block_rows(), + orig->get_num_block_cols(), nnzb, orig->get_const_values(), + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs); + array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + sparselib::bsr_transpose( + exec->get_sparselib_handle(), orig->get_num_block_rows(), + orig->get_num_block_cols(), nnzb, orig->get_const_values(), + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs, + trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(), + copyValues, idxBase, buffer); + + // transpose blocks + select_transpose_blocks( + fixedblock::compiled_kernels(), + [bs](int compiled_block_size) { return bs == compiled_block_size; }, + syn::value_list(), syn::type_list<>(), 
exec, trans); + } else +#endif + { + fallback_transpose(exec, orig, trans); + } } diff --git a/hip/matrix/fft_kernels.hip.cpp b/hip/matrix/fft_kernels.hip.cpp index dc397b20892..92358d732c7 100644 --- a/hip/matrix/fft_kernels.hip.cpp +++ b/hip/matrix/fft_kernels.hip.cpp @@ -8,7 +8,7 @@ #include -#include +#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp index 8028dd0777f..f1e15c946e0 100644 --- a/hip/matrix/sellp_kernels.hip.cpp +++ b/hip/matrix/sellp_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/matrix/sellp_kernels.hpp" -#include - - #include #include #include @@ -15,10 +12,11 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/components/prefix_sum_kernels.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp index e5a6900cdfe..487b134d28a 100644 --- a/hip/matrix/sparsity_csr_kernels.hip.cpp +++ b/hip/matrix/sparsity_csr_kernels.hip.cpp @@ -5,25 +5,25 @@ #include "core/matrix/sparsity_csr_kernels.hpp" -#include #include #include -#include "accessor/hip_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" #include "accessor/reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" #include 
"core/components/format_conversion_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" @@ -42,7 +42,11 @@ namespace sparsity_csr { constexpr int classical_oversubscription = 32; constexpr int default_block_size = 512; +#ifdef GKO_COMPILING_HIP constexpr int spmv_block_size = 256; +#else +constexpr int spmv_block_size = 128; +#endif constexpr int warps_in_block = 4; @@ -106,16 +110,16 @@ void classical_spmv(syn::value_list, a->get_size()[0], as_device_type(a->get_const_value()), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_hip_range(b_vals), acc::as_hip_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } else if (alpha != nullptr && beta != nullptr) { kernel::abstract_classical_spmv <<get_stream()>>>( a->get_size()[0], as_device_type(alpha->get_const_values()), as_device_type(a->get_const_value()), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_hip_range(b_vals), + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), - acc::as_hip_range(c_vals)); + acc::as_device_range(c_vals)); } else { GKO_KERNEL_NOT_FOUND; } @@ -169,21 +173,21 @@ void sort_by_column_index(std::shared_ptr exec, const auto num_cols = static_cast(to_sort->get_size()[1]); const auto row_ptrs = to_sort->get_const_row_ptrs(); const auto col_idxs = to_sort->get_col_idxs(); - if (hipsparse::is_supported::value) { - const auto handle = exec->get_hipsparse_handle(); - auto descr = hipsparse::create_mat_descr(); + if (sparselib::is_supported::value) { + const auto handle = exec->get_sparselib_handle(); + 
auto descr = sparselib::create_mat_descr(); array permutation_array(exec, to_sort->get_num_nonzeros()); auto permutation = permutation_array.get_data(); components::fill_seq_array(exec, permutation, to_sort->get_num_nonzeros()); size_type buffer_size{}; - hipsparse::csrsort_buffer_size(handle, num_rows, num_cols, nnz, + sparselib::csrsort_buffer_size(handle, num_rows, num_cols, nnz, row_ptrs, col_idxs, buffer_size); array buffer_array{exec, buffer_size}; auto buffer = buffer_array.get_data(); - hipsparse::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs, + sparselib::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs, col_idxs, permutation, buffer); - hipsparse::destroy(descr); + sparselib::destroy(descr); } else { fallback_sort(exec, to_sort); } diff --git a/hip/multigrid/pgm_kernels.hip.cpp b/hip/multigrid/pgm_kernels.hip.cpp index ed81d1c66dc..18c1f0957c4 100644 --- a/hip/multigrid/pgm_kernels.hip.cpp +++ b/hip/multigrid/pgm_kernels.hip.cpp @@ -19,8 +19,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp index 6d58244a41a..f3969c16b81 100644 --- a/hip/preconditioner/batch_preconditioners.hip.hpp +++ b/hip/preconditioner/batch_preconditioners.hip.hpp @@ -6,9 +6,9 @@ #define GKO_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HIP_HPP_ +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp index 7339bd0a754..4eaf65cc438 100644 --- a/hip/preconditioner/isai_kernels.hip.cpp +++ b/hip/preconditioner/isai_kernels.hip.cpp @@ -5,21 +5,18 @@ #include 
"core/preconditioner/isai_kernels.hpp" -#include - - #include #include #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp index 326b9f6b720..67a65385ca4 100644 --- a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp @@ -5,20 +5,18 @@ #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" @@ -35,7 +33,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc" +#include // clang-format off diff --git 
a/hip/preconditioner/jacobi_common.hip.hpp.in b/hip/preconditioner/jacobi_common.hip.hpp.in index 6e9c279a46f..2185e124db6 100644 --- a/hip/preconditioner/jacobi_common.hip.hpp.in +++ b/hip/preconditioner/jacobi_common.hip.hpp.in @@ -6,7 +6,7 @@ #include -#include "hip/base/config.hip.hpp" +#include "common/cuda_hip/base/config.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp index 86a3b799590..d95a97d7068 100644 --- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp @@ -9,14 +9,14 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/diagonal_block_manipulation.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" @@ -35,7 +35,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc" +#include // clang-format off diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernel.hip.cpp index 713be193250..50bf72ea964 100644 --- a/hip/preconditioner/jacobi_generate_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernel.hip.cpp @@ -5,21 +5,19 @@ #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" 
+#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/diagonal_block_manipulation.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" @@ -38,7 +36,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc" +#include template - - #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" @@ -34,9 +32,9 @@ namespace jacobi { // a total of 32/16 warps (1024 threads) -#if GINKGO_HIP_PLATFORM_HCC +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC constexpr int default_num_warps = 16; -#else // GINKGO_HIP_PLATFORM_NVCC +#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC constexpr int default_num_warps = 32; #endif // with current architectures, at most 32 warps can be scheduled per SM (and diff --git a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp index be485af5730..b3e6e6fe73b 100644 --- 
a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp @@ -8,14 +8,14 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" @@ -32,7 +32,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc" +#include // clang-format off diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp index 0763e986d41..e8e247210ec 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp @@ -5,20 +5,18 @@ #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/warp_blas.hip.hpp" #include 
"hip/preconditioner/jacobi_common.hip.hpp" @@ -35,7 +33,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc" +#include template +#include "common/cuda_hip/components/memory.hpp" #include "core/base/array_access.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/memory.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index c62c11405a5..fdeb0580931 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -5,7 +5,6 @@ #include "core/solver/batch_bicgstab_kernels.hpp" -#include #include #include @@ -14,15 +13,16 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index d61eead6fab..47c2bc498eb 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -5,7 +5,6 @@ #include "core/solver/batch_cg_kernels.hpp" -#include #include #include @@ -14,15 +13,16 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include 
"core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp index 794ac9fd8a6..2f2df4ddf84 100644 --- a/hip/solver/cb_gmres_kernels.hip.cpp +++ b/hip/solver/cb_gmres_kernels.hip.cpp @@ -14,19 +14,19 @@ #include -#include "accessor/hip_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" #include "accessor/range.hpp" #include "accessor/reduced_row_major.hpp" #include "accessor/scaled_reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/solver/cb_gmres_accessor.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" @@ -118,7 +118,7 @@ void restart(std::shared_ptr exec, restart_1_kernel <<get_stream()>>>( residual->get_size()[0], residual->get_size()[1], krylov_dim, - acc::as_hip_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm, @@ -147,7 
+147,7 @@ void restart(std::shared_ptr exec, residual_norm->get_stride(), as_device_type(arnoldi_norm->get_const_values() + 2 * stride_arnoldi), - stride_arnoldi, acc::as_hip_range(krylov_bases)); + stride_arnoldi, acc::as_device_range(krylov_bases)); } const auto grid_dim_2 = @@ -160,7 +160,7 @@ void restart(std::shared_ptr exec, residual->get_stride(), as_device_type(residual_norm->get_const_values()), as_device_type(residual_norm_collection->get_values()), - acc::as_hip_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(next_krylov_basis->get_values()), next_krylov_basis->get_stride(), as_device_type(final_iter_nums->get_data())); @@ -214,7 +214,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, as_device_type(next_krylov_basis->get_const_values()), stride_next_krylov, as_device_type(arnoldi_norm->get_values()), as_device_type(stop_status)); - // nrmP = norm(next_krylov_basis + // nrmP = norm(next_krylov_basis) zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg, hessenberg_iter->get_values()); if (dim_size[1] > 1) { @@ -222,7 +222,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, <<get_stream()>>>( dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_values()), stride_hessenberg, as_device_type(stop_status)); } else { @@ -231,7 +231,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[0], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_values()), stride_hessenberg, as_device_type(stop_status)); } @@ -243,7 +243,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, default_block_size, 0, exec->get_stream()>>>( iter + 1, dim_size[0], dim_size[1], 
as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_hip_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_const_values()), stride_hessenberg, as_device_type(stop_status)); @@ -272,7 +272,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[1], as_device_type(arnoldi_norm->get_values()), stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases), + stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), as_device_type(stop_status), as_device_type(reorth_status), as_device_type(num_reorth->get_data())); num_reorth_host = get_element(*num_reorth, 0); @@ -285,7 +285,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, <<get_stream()>>>( dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(buffer_iter->get_values()), stride_buffer, as_device_type(stop_status)); } else { @@ -294,7 +294,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[0], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(buffer_iter->get_values()), stride_buffer, as_device_type(stop_status)); } @@ -306,7 +306,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, default_block_size, 0, exec->get_stream()>>>( iter + 1, dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_values()), stride_hessenberg, as_device_type(buffer_iter->get_const_values()), stride_buffer, @@ -338,7 +338,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( 
dim_size[1], as_device_type(arnoldi_norm->get_values()), stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases), + stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), as_device_type(stop_status), as_device_type(reorth_status), num_reorth->get_data()); num_reorth_host = get_element(*num_reorth, 0); @@ -350,7 +350,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, default_block_size, 0, exec->get_stream()>>>( iter, dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_hip_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_const_values()), stride_hessenberg, as_device_type(stop_status)); // next_krylov_basis /= hessenberg(iter, iter + 1) @@ -464,7 +464,7 @@ void calculate_qy(std::shared_ptr exec, calculate_Qy_kernel <<get_stream()>>>( - num_rows, num_cols, acc::as_hip_range(krylov_bases), + num_rows, num_cols, acc::as_device_range(krylov_bases), as_device_type(y->get_const_values()), y->get_stride(), as_device_type(before_preconditioner->get_values()), stride_before_preconditioner, diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp index d05bc1a9f6f..daab3a387e6 100644 --- a/hip/solver/common_trs_kernels.hip.hpp +++ b/hip/solver/common_trs_kernels.hip.hpp @@ -10,7 +10,7 @@ #include -#include +#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -22,12 +22,12 @@ #include +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { @@ -63,7 +63,7 @@ 
struct SolveStruct : gko::solver::SolveStruct { factor_descr, unit_diag ? HIPSPARSE_DIAG_TYPE_UNIT : HIPSPARSE_DIAG_TYPE_NON_UNIT)); GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateCsrsv2Info(&solve_info)); - policy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL; + policy = SPARSELIB_SOLVE_POLICY_USE_LEVEL; } SolveStruct(const SolveStruct&) = delete; @@ -114,17 +114,17 @@ void generate_kernel(std::shared_ptr exec, if (matrix->get_size()[0] == 0) { return; } - if (hipsparse::is_supported::value) { + if (sparselib::is_supported::value) { solve_struct = std::make_shared(is_upper, unit_diag); if (auto hip_solve_struct = std::dynamic_pointer_cast( solve_struct)) { - auto handle = exec->get_hipsparse_handle(); + auto handle = exec->get_sparselib_handle(); { - hipsparse::pointer_mode_guard pm_guard(handle); - hipsparse::csrsv2_buffer_size( + sparselib::pointer_mode_guard pm_guard(handle); + sparselib::csrsv2_buffer_size( handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, matrix->get_size()[0], matrix->get_num_stored_elements(), hip_solve_struct->factor_descr, matrix->get_const_values(), @@ -139,7 +139,7 @@ void generate_kernel(std::shared_ptr exec, hip_solve_struct->factor_work_vec = exec->alloc(hip_solve_struct->factor_work_size); - hipsparse::csrsv2_analysis( + sparselib::csrsv2_analysis( handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, matrix->get_size()[0], matrix->get_num_stored_elements(), hip_solve_struct->factor_descr, matrix->get_const_values(), @@ -170,16 +170,16 @@ void solve_kernel(std::shared_ptr exec, } using vec = matrix::Dense; - if (hipsparse::is_supported::value) { + if (sparselib::is_supported::value) { if (auto hip_solve_struct = dynamic_cast(solve_struct)) { ValueType one = 1.0; - auto handle = exec->get_hipsparse_handle(); + auto handle = exec->get_sparselib_handle(); { - hipsparse::pointer_mode_guard pm_guard(handle); + sparselib::pointer_mode_guard pm_guard(handle); if (b->get_stride() == 1) { - hipsparse::csrsv2_solve( + sparselib::csrsv2_solve( handle, 
HIPSPARSE_OPERATION_NON_TRANSPOSE, matrix->get_size()[0], matrix->get_num_stored_elements(), &one, @@ -194,7 +194,7 @@ void solve_kernel(std::shared_ptr exec, dense::transpose(exec, b, trans_b); dense::transpose(exec, x, trans_x); for (IndexType i = 0; i < trans_b->get_size()[0]; i++) { - hipsparse::csrsv2_solve( + sparselib::csrsv2_solve( handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, matrix->get_size()[0], matrix->get_num_stored_elements(), &one, diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp index 83dbfe61f48..b1ef414c091 100644 --- a/hip/solver/idr_kernels.hip.cpp +++ b/hip/solver/idr_kernels.hip.cpp @@ -9,20 +9,19 @@ #include -#include - - #include #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/randlib_bindings.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/fill_array_kernels.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/hiprand_bindings.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" @@ -71,14 +70,14 @@ void initialize_subspace_vectors(std::shared_ptr exec, bool deterministic) { if (!deterministic) { - auto gen = hiprand::rand_generator(std::random_device{}(), - HIPRAND_RNG_PSEUDO_DEFAULT, + auto gen = randlib::rand_generator(std::random_device{}(), + RANDLIB_RNG_PSEUDO_DEFAULT, exec->get_stream()); - hiprand::rand_vector( + randlib::rand_vector( gen, subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), 0.0, 1.0, subspace_vectors->get_values()); - hiprand::destroy(gen); + randlib::destroy(gen); } } @@ -147,9 +146,8 @@ void update_g_and_u(std::shared_ptr exec, 
as_device_type(alpha->get_values()), stop_status->get_const_data()); } else { - hipblas::dot(exec->get_hipblas_handle(), size, p_i, 1, - g_k->get_values(), g_k->get_stride(), - alpha->get_values()); + blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(), + g_k->get_stride(), alpha->get_values()); } update_g_k_and_u_kernel <<get_stride(), default_block_size), @@ -198,8 +196,8 @@ void update_m(std::shared_ptr exec, const size_type nrhs, as_device_type(g_k->get_const_values()), g_k->get_stride(), as_device_type(m_i), stop_status->get_const_data()); } else { - hipblas::dot(exec->get_hipblas_handle(), size, p_i, 1, - g_k->get_const_values(), g_k->get_stride(), m_i); + blas::dot(exec->get_blas_handle(), size, p_i, 1, + g_k->get_const_values(), g_k->get_stride(), m_i); } } } diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp index 08f35d3d674..1a43b3c0151 100644 --- a/hip/solver/lower_trs_kernels.hip.cpp +++ b/hip/solver/lower_trs_kernels.hip.cpp @@ -8,7 +8,7 @@ #include -#include +#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -21,9 +21,9 @@ #include -#include "hip/base/hipsparse_bindings.hip.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/solver/common_trs_kernels.hip.hpp" diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp index 41aab8003bd..f68105ba6d8 100644 --- a/hip/solver/multigrid_kernels.hip.cpp +++ b/hip/solver/multigrid_kernels.hip.cpp @@ -5,18 +5,16 @@ #include "core/solver/multigrid_kernels.hpp" -#include - - #include #include #include #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/base/array_access.hpp" #include "core/components/fill_array_kernels.hpp" -#include "hip/base/types.hip.hpp" #include 
"hip/components/thread_ids.hip.hpp" diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp index cd6b0719844..bcb63a26bc8 100644 --- a/hip/solver/upper_trs_kernels.hip.cpp +++ b/hip/solver/upper_trs_kernels.hip.cpp @@ -8,7 +8,7 @@ #include -#include +#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -21,9 +21,9 @@ #include -#include "hip/base/hipsparse_bindings.hip.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/solver/common_trs_kernels.hip.hpp" diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp index 8c7caeb32b8..3d24daa5bd5 100644 --- a/hip/stop/criterion_kernels.hip.cpp +++ b/hip/stop/criterion_kernels.hip.cpp @@ -10,8 +10,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp index d790dd652f0..7f2b0646ea2 100644 --- a/hip/stop/residual_norm_kernels.hip.cpp +++ b/hip/stop/residual_norm_kernels.hip.cpp @@ -5,17 +5,15 @@ #include "core/stop/residual_norm_kernels.hpp" -#include - - #include #include #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/base/array_access.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp index 2c25f5b3a7a..8462cbe5716 100644 --- a/hip/test/base/math.hip.cpp +++ b/hip/test/base/math.hip.cpp @@ -23,8 +23,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/test/utils.hip.hpp" diff --git 
a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp index d22dfeca0b6..53f4b9a72a0 100644 --- a/hip/test/components/cooperative_groups.hip.cpp +++ b/hip/test/components/cooperative_groups.hip.cpp @@ -8,9 +8,6 @@ // force-top: off -#include "hip/components/cooperative_groups.hip.hpp" - - #include #include @@ -22,7 +19,8 @@ #include -#include "hip/base/types.hip.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index 7bfab76f795..b8ee2f03d29 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -24,7 +24,7 @@ #include -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp index 1db0c6e9562..2662d367f4d 100644 --- a/hip/test/components/searching.hip.cpp +++ b/hip/test/components/searching.hip.cpp @@ -23,7 +23,7 @@ #include -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/test/utils.hip.hpp" diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index c1e3f54a720..ca20dba9007 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -1602,6 +1602,11 @@ class CudaExecutor : public detail::ExecutorBase, */ cublasContext* get_cublas_handle() const { return cublas_handle_.get(); } + /** + * @copydoc get_cublas_handle() + */ + cublasContext* get_blas_handle() const { return get_cublas_handle(); } + /** * Get the cusparse handle for this executor * @@ -1612,6 +1617,14 @@ class CudaExecutor : public detail::ExecutorBase, return cusparse_handle_.get(); } + /** + * @copydoc 
get_cusparse_handle() + */ + cusparseContext* get_sparselib_handle() const + { + return get_cusparse_handle(); + } + /** * Get the closest PUs * @@ -1807,6 +1820,11 @@ class HipExecutor : public detail::ExecutorBase, */ hipblasContext* get_hipblas_handle() const { return hipblas_handle_.get(); } + /** + * @copydoc get_hipblas_handle() + */ + hipblasContext* get_blas_handle() const { return get_hipblas_handle(); } + /** * Get the hipsparse handle for this executor * @@ -1817,6 +1835,14 @@ class HipExecutor : public detail::ExecutorBase, return hipsparse_handle_.get(); } + /** + * @copydoc get_hipsparse_handle() + */ + hipsparseContext* get_sparselib_handle() const + { + return get_hipsparse_handle(); + } + /** * Get the closest NUMA node * diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 59d49e44140..333bb2a9b21 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -54,7 +54,7 @@ target_sources(ginkgo_omp ) ginkgo_compile_features(ginkgo_omp) -target_compile_definitions(ginkgo_omp PRIVATE GKO_COMPILING_OMP) +target_compile_definitions(ginkgo_omp PRIVATE GKO_COMPILING_OMP GKO_GKO_DEVICE_NAMESPACE=omp) # TODO FIXME: Currently nvhpc 22.7+ optimizations break the omp jacobi's custom # precision implementation (mantissa segmentation) @@ -94,7 +94,7 @@ ginkgo_default_includes(ginkgo_omp) ginkgo_install_library(ginkgo_omp) if (GINKGO_CHECK_CIRCULAR_DEPS) - ginkgo_check_headers(ginkgo_omp GKO_COMPILING_OMP) + ginkgo_check_headers(ginkgo_omp "GKO_COMPILING_OMP;GKO_GKO_DEVICE_NAMESPACE=omp") endif() if(GINKGO_BUILD_TESTS) From 48146482c8a2dddaaf2439c8a3d566e7ec7b85eb Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 22 May 2024 14:04:35 +0200 Subject: [PATCH 008/448] fix compilation --- benchmark/CMakeLists.txt | 2 + common/cuda_hip/base/blas_bindings.hpp | 8 ++-- common/cuda_hip/base/config.hpp | 8 ++-- common/cuda_hip/base/pointer_mode_guard.hpp | 8 ++-- common/cuda_hip/base/randlib_bindings.hpp | 8 ++-- common/cuda_hip/base/runtime.hpp | 6 ++- 
common/cuda_hip/base/sparselib_bindings.hpp | 8 ++-- common/cuda_hip/base/thrust.hpp | 4 +- common/cuda_hip/base/types.hpp | 11 ++++- .../components/cooperative_groups.hpp | 8 ++-- .../cuda_hip/components/format_conversion.hpp | 8 ++-- common/cuda_hip/components/memory.hpp | 8 ++-- cuda/CMakeLists.txt | 34 ++++++------- ...cobi_advanced_apply_kernels.instantiate.cu | 2 +- .../jacobi_generate_kernels.instantiate.cu | 2 +- ...jacobi_simple_apply_kernels.instantiate.cu | 2 +- hip/CMakeLists.txt | 48 +++++++++---------- ... => jacobi_advanced_apply_kernels.hip.cpp} | 0 ...dvanced_apply_kernels.instantiate.hip.cpp} | 2 +- ...ip.cpp => jacobi_generate_kernels.hip.cpp} | 2 +- ...cobi_generate_kernels.instantiate.hip.cpp} | 2 +- ...pp => jacobi_simple_apply_kernels.hip.cpp} | 2 +- ..._simple_apply_kernels.instantiate.hip.cpp} | 2 +- 23 files changed, 109 insertions(+), 76 deletions(-) rename hip/preconditioner/{jacobi_advanced_apply_kernel.hip.cpp => jacobi_advanced_apply_kernels.hip.cpp} (100%) rename hip/preconditioner/{jacobi_advanced_apply_instantiate.inc.hip.cpp => jacobi_advanced_apply_kernels.instantiate.hip.cpp} (97%) rename hip/preconditioner/{jacobi_generate_kernel.hip.cpp => jacobi_generate_kernels.hip.cpp} (97%) rename hip/preconditioner/{jacobi_generate_instantiate.inc.hip.cpp => jacobi_generate_kernels.instantiate.hip.cpp} (98%) rename hip/preconditioner/{jacobi_simple_apply_kernel.hip.cpp => jacobi_simple_apply_kernels.hip.cpp} (97%) rename hip/preconditioner/{jacobi_simple_apply_instantiate.inc.hip.cpp => jacobi_simple_apply_kernels.instantiate.hip.cpp} (97%) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index ca209e65057..306655d2315 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -20,6 +20,7 @@ function(ginkgo_benchmark_cusparse_linops type def) endif() # make the dependency public to catch issues target_compile_definitions(cusparse_linops_${type} PUBLIC ${def}) + 
target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA) target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse) endfunction() @@ -27,6 +28,7 @@ function(ginkgo_benchmark_hipsparse_linops type def) add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp) set_source_files_properties(utils/hip_linops.hip.cpp PROPERTIES LANGUAGE HIP) target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def}) + target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP) target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS}) target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES}) endfunction() diff --git a/common/cuda_hip/base/blas_bindings.hpp b/common/cuda_hip/base/blas_bindings.hpp index 1708fb88ce1..784d67515de 100644 --- a/common/cuda_hip/base/blas_bindings.hpp +++ b/common/cuda_hip/base/blas_bindings.hpp @@ -6,10 +6,12 @@ #define GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_ -#ifdef GKO_COMPILING_HIP -#include "hip/base/hipblas_bindings.hip.hpp" -#else // GKO_COMPILING_CUDA +#ifdef GKO_COMPILING_CUDA #include "cuda/base/cublas_bindings.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/hipblas_bindings.hip.hpp" +#else +#error "Executor definition missing" #endif diff --git a/common/cuda_hip/base/config.hpp b/common/cuda_hip/base/config.hpp index d2085ae946b..2bc4b78cfd9 100644 --- a/common/cuda_hip/base/config.hpp +++ b/common/cuda_hip/base/config.hpp @@ -6,10 +6,12 @@ #define GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_ -#ifdef GKO_COMPILING_HIP -#include "hip/base/config.hip.hpp" -#else // GKO_COMPILING_CUDA +#ifdef GKO_COMPILING_CUDA #include "cuda/base/config.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/config.hip.hpp" +#else +#error "Executor definition missing" #endif diff --git a/common/cuda_hip/base/pointer_mode_guard.hpp b/common/cuda_hip/base/pointer_mode_guard.hpp 
index 41ff6242e49..ddc51557ac4 100644 --- a/common/cuda_hip/base/pointer_mode_guard.hpp +++ b/common/cuda_hip/base/pointer_mode_guard.hpp @@ -6,10 +6,12 @@ #define GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_ -#ifdef GKO_COMPILING_HIP -#include "hip/base/pointer_mode_guard.hip.hpp" -#else // GKO_COMPILING_CUDA +#ifdef GKO_COMPILING_CUDA #include "cuda/base/pointer_mode_guard.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/pointer_mode_guard.hip.hpp" +#else +#error "Executor definition missing" #endif diff --git a/common/cuda_hip/base/randlib_bindings.hpp b/common/cuda_hip/base/randlib_bindings.hpp index 249489b0e68..d7d023d2b70 100644 --- a/common/cuda_hip/base/randlib_bindings.hpp +++ b/common/cuda_hip/base/randlib_bindings.hpp @@ -6,10 +6,12 @@ #define GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_ -#ifdef GKO_COMPILING_HIP -#include "hip/base/hiprand_bindings.hip.hpp" -#else // GKO_COMPILING_CUDA +#ifdef GKO_COMPILING_CUDA #include "cuda/base/curand_bindings.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/hiprand_bindings.hip.hpp" +#else +#error "Executor definition missing" #endif diff --git a/common/cuda_hip/base/runtime.hpp b/common/cuda_hip/base/runtime.hpp index ccddfdd2661..2020c744b71 100644 --- a/common/cuda_hip/base/runtime.hpp +++ b/common/cuda_hip/base/runtime.hpp @@ -6,8 +6,12 @@ #define GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_ -#ifdef GKO_COMPILING_HIP +#ifdef GKO_COMPILING_CUDA +// nothing needed here +#elif defined(GKO_COMPILING_HIP) #include +#else +#error "Executor definition missing" #endif diff --git a/common/cuda_hip/base/sparselib_bindings.hpp b/common/cuda_hip/base/sparselib_bindings.hpp index bc565f9190a..eeb7cef0734 100644 --- a/common/cuda_hip/base/sparselib_bindings.hpp +++ b/common/cuda_hip/base/sparselib_bindings.hpp @@ -6,10 +6,12 @@ #define GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_ -#ifdef GKO_COMPILING_HIP -#include "hip/base/hipsparse_bindings.hip.hpp" -#else // GKO_COMPILING_CUDA +#ifdef 
GKO_COMPILING_CUDA #include "cuda/base/cusparse_bindings.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/hipsparse_bindings.hip.hpp" +#else +#error "Executor definition missing" #endif diff --git a/common/cuda_hip/base/thrust.hpp b/common/cuda_hip/base/thrust.hpp index f2015d6d544..adc904d550c 100644 --- a/common/cuda_hip/base/thrust.hpp +++ b/common/cuda_hip/base/thrust.hpp @@ -31,7 +31,7 @@ inline auto thrust_policy(std::shared_ptr exec) { return thrust::cuda::par.on(exec->get_stream()); } -#else +#elif defined(GKO_COMPILING_HIP) inline auto thrust_policy(std::shared_ptr exec) { #if GINKGO_HIP_PLATFORM_HCC @@ -40,6 +40,8 @@ inline auto thrust_policy(std::shared_ptr exec) return thrust::cuda::par.on(exec->get_stream()); #endif } +#else +#error "Executor definition missing" #endif diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp index 213664d3a4d..ff6302a68fa 100644 --- a/common/cuda_hip/base/types.hpp +++ b/common/cuda_hip/base/types.hpp @@ -2,8 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_ + + #ifdef GKO_COMPILING_CUDA #include "cuda/base/types.hpp" -#else +#elif defined(GKO_COMPILING_HIP) #include "hip/base/types.hip.hpp" +#else +#error "Executor definition missing" #endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_ diff --git a/common/cuda_hip/components/cooperative_groups.hpp b/common/cuda_hip/components/cooperative_groups.hpp index b1f17842302..64c9be4fa8e 100644 --- a/common/cuda_hip/components/cooperative_groups.hpp +++ b/common/cuda_hip/components/cooperative_groups.hpp @@ -6,10 +6,12 @@ #define GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_ -#ifdef GKO_COMPILING_HIP -#include "hip/components/cooperative_groups.hip.hpp" -#else // GKO_COMPILING_CUDA +#ifdef GKO_COMPILING_CUDA #include "cuda/components/cooperative_groups.cuh" +#elif defined(GKO_COMPILING_HIP) +#include 
"hip/components/cooperative_groups.hip.hpp" +#else +#error "Executor definition missing" #endif diff --git a/common/cuda_hip/components/format_conversion.hpp b/common/cuda_hip/components/format_conversion.hpp index a16d09b2e3a..af6461ccd5e 100644 --- a/common/cuda_hip/components/format_conversion.hpp +++ b/common/cuda_hip/components/format_conversion.hpp @@ -6,10 +6,12 @@ #define GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_ -#ifdef GKO_COMPILING_HIP -#include "hip/components/format_conversion.hip.hpp" -#else // GKO_COMPILING_CUDA +#ifdef GKO_COMPILING_CUDA #include "cuda/components/format_conversion.cuh" +#elif defined(GKO_COMPILING_HIP) +#include "hip/components/format_conversion.hip.hpp" +#else +#error "Executor definition missing" #endif diff --git a/common/cuda_hip/components/memory.hpp b/common/cuda_hip/components/memory.hpp index 974431e2fb8..e7b1de548c6 100644 --- a/common/cuda_hip/components/memory.hpp +++ b/common/cuda_hip/components/memory.hpp @@ -6,10 +6,12 @@ #define GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_ -#ifdef GKO_COMPILING_HIP -#include "hip/components/memory.hip.hpp" -#else // GKO_COMPILING_CUDA +#ifdef GKO_COMPILING_CUDA #include "cuda/components/memory.cuh" +#elif defined(GKO_COMPILING_HIP) +#include "hip/components/memory.hip.hpp" +#else +#error "Executor definition missing" #endif diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 88ae83e9005..3d251ecfa82 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -33,12 +33,12 @@ target_sources(ginkgo_cuda factorization/par_ic_kernels.cu factorization/par_ict_kernels.cu factorization/par_ilu_kernels.cu - factorization/par_ilut_approx_filter_kernel.cu - factorization/par_ilut_filter_kernel.cu + factorization/par_ilut_approx_filter_kernels.cu + factorization/par_ilut_filter_kernels.cu factorization/par_ilut_select_common.cu - factorization/par_ilut_select_kernel.cu - factorization/par_ilut_spgeam_kernel.cu - factorization/par_ilut_sweep_kernel.cu + 
factorization/par_ilut_select_kernels.cu + factorization/par_ilut_spgeam_kernels.cu + factorization/par_ilut_sweep_kernels.cu matrix/batch_csr_kernels.cu matrix/batch_dense_kernels.cu matrix/batch_ell_kernels.cu @@ -54,10 +54,10 @@ target_sources(ginkgo_cuda multigrid/pgm_kernels.cu preconditioner/batch_jacobi_kernels.cu preconditioner/isai_kernels.cu - preconditioner/jacobi_advanced_apply_kernel.cu - preconditioner/jacobi_generate_kernel.cu + preconditioner/jacobi_advanced_apply_kernels.cu + preconditioner/jacobi_generate_kernels.cu preconditioner/jacobi_kernels.cu - preconditioner/jacobi_simple_apply_kernel.cu + preconditioner/jacobi_simple_apply_kernels.cu reorder/rcm_kernels.cu solver/batch_bicgstab_kernels.cu solver/batch_cg_kernels.cu @@ -85,18 +85,18 @@ endif() set(GKO_CUDA_JACOBI_SOURCES) foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_CUDA_JACOBI_BLOCK_SIZES) configure_file( - preconditioner/jacobi_generate_instantiate.inc.cu - preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) + preconditioner/jacobi_generate_kernels.instantiate.cu + preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) configure_file( - preconditioner/jacobi_simple_apply_instantiate.inc.cu - preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) + preconditioner/jacobi_simple_apply_kernels.instantiate.cu + preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) configure_file( - preconditioner/jacobi_advanced_apply_instantiate.inc.cu - preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) + preconditioner/jacobi_advanced_apply_kernels.instantiate.cu + preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) list(APPEND GKO_CUDA_JACOBI_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu - 
${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) endforeach() target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES}) string(REPLACE ";" "," GKO_CUDA_JACOBI_BLOCK_SIZES_CODE "${GKO_CUDA_JACOBI_BLOCK_SIZES}") diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu index ed33437c613..10ede90da7e 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu @@ -32,7 +32,7 @@ namespace cuda { namespace jacobi { -#include +#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc" // clang-format off diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu index 56e8ff6f16f..129c50625f4 100644 --- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu @@ -35,7 +35,7 @@ namespace cuda { namespace jacobi { -#include +#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc" // clang-format off diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu index 97a7bfff489..15f6dc138ad 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu @@ -32,7 +32,7 @@ namespace cuda { namespace jacobi { -#include +#include 
"common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc" // clang-format off diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index de44eb20682..99e167b9798 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.21) include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) -add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE) -add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANTIATE) +add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_kernels.instantiate) +add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_kernels.instantiate) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES @@ -31,30 +31,30 @@ set(GINKGO_HIP_SOURCES factorization/par_ic_kernels.hip.cpp factorization/par_ict_kernels.hip.cpp factorization/par_ilu_kernels.hip.cpp - factorization/par_ilut_approx_filter_kernel.hip.cpp - factorization/par_ilut_filter_kernel.hip.cpp + factorization/par_ilut_approx_filter_kernels.hip.cpp + factorization/par_ilut_filter_kernels.hip.cpp factorization/par_ilut_select_common.hip.cpp - factorization/par_ilut_select_kernel.hip.cpp - factorization/par_ilut_spgeam_kernel.hip.cpp - factorization/par_ilut_sweep_kernel.hip.cpp + factorization/par_ilut_select_kernels.hip.cpp + factorization/par_ilut_spgeam_kernels.hip.cpp + factorization/par_ilut_sweep_kernels.hip.cpp matrix/batch_csr_kernels.hip.cpp matrix/batch_dense_kernels.hip.cpp matrix/batch_ell_kernels.hip.cpp matrix/coo_kernels.hip.cpp - ${CSR_INSTANTIATE} + ${CSR_kernels.instantiate} matrix/dense_kernels.hip.cpp matrix/diagonal_kernels.hip.cpp matrix/ell_kernels.hip.cpp - ${FBCSR_INSTANTIATE} + ${FBCSR_kernels.instantiate} matrix/sellp_kernels.hip.cpp matrix/sparsity_csr_kernels.hip.cpp 
multigrid/pgm_kernels.hip.cpp preconditioner/batch_jacobi_kernels.hip.cpp preconditioner/isai_kernels.hip.cpp - preconditioner/jacobi_advanced_apply_kernel.hip.cpp - preconditioner/jacobi_generate_kernel.hip.cpp + preconditioner/jacobi_advanced_apply_kernels.hip.cpp + preconditioner/jacobi_generate_kernels.hip.cpp preconditioner/jacobi_kernels.hip.cpp - preconditioner/jacobi_simple_apply_kernel.hip.cpp + preconditioner/jacobi_simple_apply_kernels.hip.cpp reorder/rcm_kernels.hip.cpp solver/batch_bicgstab_kernels.hip.cpp solver/batch_cg_kernels.hip.cpp @@ -86,28 +86,28 @@ else() endif() foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES) configure_file( - preconditioner/jacobi_generate_instantiate.inc.hip.cpp - preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) + preconditioner/jacobi_generate_kernels.instantiate.hip.cpp + preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) configure_file( - preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp - preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) + preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp + preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) configure_file( - preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp - preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) + preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp + preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) # The 3D indexing used in Jacobi kernel triggers an instruction selection bug in Debug builds # Probably the same as https://github.com/llvm/llvm-project/issues/67574 # Fixed in ROCm 6.0 https://github.com/ROCm/llvm-project/commit/cd7f574a1fd1d3f3e8b9c1cae61fa8133a51de5f # and in LLVM trunk https://github.com/llvm/llvm-project/commit/cc3d2533cc2e4ea06981b86ede5087fbf801e789 
set_source_files_properties( - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp PROPERTIES COMPILE_OPTIONS $<$:-O2>) list(APPEND GINKGO_HIP_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) endforeach() string(REPLACE ";" "," GKO_HIP_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}") configure_file(preconditioner/jacobi_common.hip.hpp.in preconditioner/jacobi_common.hip.hpp) diff --git a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp similarity index 100% rename from hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp rename to hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp diff --git 
a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp similarity index 97% rename from hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp rename to hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp index 67a65385ca4..358c6f3b337 100644 --- a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp @@ -33,7 +33,7 @@ namespace hip { namespace jacobi { -#include +#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc" // clang-format off diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.hip.cpp similarity index 97% rename from hip/preconditioner/jacobi_generate_kernel.hip.cpp rename to hip/preconditioner/jacobi_generate_kernels.hip.cpp index 50bf72ea964..6365f6c132e 100644 --- a/hip/preconditioner/jacobi_generate_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernels.hip.cpp @@ -36,7 +36,7 @@ namespace hip { namespace jacobi { -#include +#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc" template +#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc" // clang-format off diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp similarity index 97% rename from hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp rename to hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp index e8e247210ec..37b78f17469 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp @@ -33,7 +33,7 @@ namespace hip { namespace jacobi { -#include +#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc" template +#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc" // clang-format 
off From 334bca2d16f3bfaafa2f45ab7af9d1296aa93ef2 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 24 Jun 2024 14:21:56 +0200 Subject: [PATCH 009/448] review updates - fix HIP compilation issues - uniform ifdef checks - deviceComplex type aliases - remove unnecessary includes Co-authored-by: Marcel Koch --- common/cuda_hip/base/blas_bindings.hpp | 2 +- common/cuda_hip/base/config.hpp | 2 +- common/cuda_hip/base/pointer_mode_guard.hpp | 2 +- common/cuda_hip/base/randlib_bindings.hpp | 2 +- common/cuda_hip/base/runtime.hpp | 2 +- common/cuda_hip/base/sparselib_bindings.hpp | 2 +- common/cuda_hip/base/thrust.hpp | 2 +- common/cuda_hip/base/types.hpp | 2 +- common/cuda_hip/components/cooperative_groups.hpp | 2 +- common/cuda_hip/components/format_conversion.hpp | 2 +- common/cuda_hip/components/memory.hpp | 2 +- cuda/base/types.hpp | 4 ++-- cuda/distributed/vector_kernels.cu | 3 --- cuda/factorization/par_ilu_kernels.cu | 1 - hip/base/config.hip.hpp | 3 --- hip/base/types.hip.hpp | 4 ++-- hip/distributed/vector_kernels.hip.cpp | 3 --- hip/factorization/par_ilu_kernels.hip.cpp | 1 - 18 files changed, 15 insertions(+), 26 deletions(-) diff --git a/common/cuda_hip/base/blas_bindings.hpp b/common/cuda_hip/base/blas_bindings.hpp index 784d67515de..e59bbf0d7a0 100644 --- a/common/cuda_hip/base/blas_bindings.hpp +++ b/common/cuda_hip/base/blas_bindings.hpp @@ -6,7 +6,7 @@ #define GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_ -#ifdef GKO_COMPILING_CUDA +#if defined(GKO_COMPILING_CUDA) #include "cuda/base/cublas_bindings.hpp" #elif defined(GKO_COMPILING_HIP) #include "hip/base/hipblas_bindings.hip.hpp" diff --git a/common/cuda_hip/base/config.hpp b/common/cuda_hip/base/config.hpp index 2bc4b78cfd9..00825fe8b72 100644 --- a/common/cuda_hip/base/config.hpp +++ b/common/cuda_hip/base/config.hpp @@ -6,7 +6,7 @@ #define GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_ -#ifdef GKO_COMPILING_CUDA +#if defined(GKO_COMPILING_CUDA) #include "cuda/base/config.hpp" #elif 
defined(GKO_COMPILING_HIP) #include "hip/base/config.hip.hpp" diff --git a/common/cuda_hip/base/pointer_mode_guard.hpp b/common/cuda_hip/base/pointer_mode_guard.hpp index ddc51557ac4..40bf694ef73 100644 --- a/common/cuda_hip/base/pointer_mode_guard.hpp +++ b/common/cuda_hip/base/pointer_mode_guard.hpp @@ -6,7 +6,7 @@ #define GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_ -#ifdef GKO_COMPILING_CUDA +#if defined(GKO_COMPILING_CUDA) #include "cuda/base/pointer_mode_guard.hpp" #elif defined(GKO_COMPILING_HIP) #include "hip/base/pointer_mode_guard.hip.hpp" diff --git a/common/cuda_hip/base/randlib_bindings.hpp b/common/cuda_hip/base/randlib_bindings.hpp index d7d023d2b70..7797ad38c64 100644 --- a/common/cuda_hip/base/randlib_bindings.hpp +++ b/common/cuda_hip/base/randlib_bindings.hpp @@ -6,7 +6,7 @@ #define GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_ -#ifdef GKO_COMPILING_CUDA +#if defined(GKO_COMPILING_CUDA) #include "cuda/base/curand_bindings.hpp" #elif defined(GKO_COMPILING_HIP) #include "hip/base/hiprand_bindings.hip.hpp" diff --git a/common/cuda_hip/base/runtime.hpp b/common/cuda_hip/base/runtime.hpp index 2020c744b71..6a7a7a3c4a2 100644 --- a/common/cuda_hip/base/runtime.hpp +++ b/common/cuda_hip/base/runtime.hpp @@ -6,7 +6,7 @@ #define GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_ -#ifdef GKO_COMPILING_CUDA +#if defined(GKO_COMPILING_CUDA) // nothing needed here #elif defined(GKO_COMPILING_HIP) #include diff --git a/common/cuda_hip/base/sparselib_bindings.hpp b/common/cuda_hip/base/sparselib_bindings.hpp index eeb7cef0734..26c0bda236d 100644 --- a/common/cuda_hip/base/sparselib_bindings.hpp +++ b/common/cuda_hip/base/sparselib_bindings.hpp @@ -6,7 +6,7 @@ #define GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_ -#ifdef GKO_COMPILING_CUDA +#if defined(GKO_COMPILING_CUDA) #include "cuda/base/cusparse_bindings.hpp" #elif defined(GKO_COMPILING_HIP) #include "hip/base/hipsparse_bindings.hip.hpp" diff --git a/common/cuda_hip/base/thrust.hpp 
b/common/cuda_hip/base/thrust.hpp index adc904d550c..02aaebc9f3d 100644 --- a/common/cuda_hip/base/thrust.hpp +++ b/common/cuda_hip/base/thrust.hpp @@ -26,7 +26,7 @@ namespace kernels { namespace GKO_DEVICE_NAMESPACE { -#ifdef GKO_COMPILING_CUDA +#if defined(GKO_COMPILING_CUDA) inline auto thrust_policy(std::shared_ptr exec) { return thrust::cuda::par.on(exec->get_stream()); diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp index ff6302a68fa..08f0516d691 100644 --- a/common/cuda_hip/base/types.hpp +++ b/common/cuda_hip/base/types.hpp @@ -6,7 +6,7 @@ #define GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_ -#ifdef GKO_COMPILING_CUDA +#if defined(GKO_COMPILING_CUDA) #include "cuda/base/types.hpp" #elif defined(GKO_COMPILING_HIP) #include "hip/base/types.hip.hpp" diff --git a/common/cuda_hip/components/cooperative_groups.hpp b/common/cuda_hip/components/cooperative_groups.hpp index 64c9be4fa8e..a57440f6d30 100644 --- a/common/cuda_hip/components/cooperative_groups.hpp +++ b/common/cuda_hip/components/cooperative_groups.hpp @@ -6,7 +6,7 @@ #define GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_ -#ifdef GKO_COMPILING_CUDA +#if defined(GKO_COMPILING_CUDA) #include "cuda/components/cooperative_groups.cuh" #elif defined(GKO_COMPILING_HIP) #include "hip/components/cooperative_groups.hip.hpp" diff --git a/common/cuda_hip/components/format_conversion.hpp b/common/cuda_hip/components/format_conversion.hpp index af6461ccd5e..9faf7a58c25 100644 --- a/common/cuda_hip/components/format_conversion.hpp +++ b/common/cuda_hip/components/format_conversion.hpp @@ -6,7 +6,7 @@ #define GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_ -#ifdef GKO_COMPILING_CUDA +#if defined(GKO_COMPILING_CUDA) #include "cuda/components/format_conversion.cuh" #elif defined(GKO_COMPILING_HIP) #include "hip/components/format_conversion.hip.hpp" diff --git a/common/cuda_hip/components/memory.hpp b/common/cuda_hip/components/memory.hpp index e7b1de548c6..9bfd9cba1e0 100644 --- 
a/common/cuda_hip/components/memory.hpp +++ b/common/cuda_hip/components/memory.hpp @@ -6,7 +6,7 @@ #define GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_ -#ifdef GKO_COMPILING_CUDA +#if defined(GKO_COMPILING_CUDA) #include "cuda/components/memory.cuh" #elif defined(GKO_COMPILING_HIP) #include "hip/components/memory.hip.hpp" diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 510d7cef889..561612f2869 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -394,8 +394,8 @@ GKO_INLINE GKO_ATTRIBUTES constexpr } -using gpuComplex = cuComplex; -using gpuDoubleComplex = cuDoubleComplex; +using deviceComplex = cuComplex; +using deviceDoubleComplex = cuDoubleComplex; } // namespace cuda diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu index 7b06ada9f0e..ca9c419239b 100644 --- a/cuda/distributed/vector_kernels.cu +++ b/cuda/distributed/vector_kernels.cu @@ -5,9 +5,6 @@ #include "core/distributed/vector_kernels.hpp" -#include - - #include #include #include diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu index cd48dd2a9db..755723e7d4c 100644 --- a/cuda/factorization/par_ilu_kernels.cu +++ b/cuda/factorization/par_ilu_kernels.cu @@ -5,7 +5,6 @@ #include "core/factorization/par_ilu_kernels.hpp" -#include #include diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp index 3f531616489..89dc67255fc 100644 --- a/hip/base/config.hip.hpp +++ b/hip/base/config.hip.hpp @@ -6,9 +6,6 @@ #define GKO_HIP_BASE_CONFIG_HIP_HPP_ -#include - - #include diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 883e5812080..cb33cbf5df8 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -432,8 +432,8 @@ GKO_INLINE GKO_ATTRIBUTES constexpr } -using gpuComplex = hipComplex; -using gpuDoubleComplex = hipDoubleComplex; +using deviceComplex = hipComplex; +using deviceDoubleComplex = hipDoubleComplex; } // namespace hip diff --git a/hip/distributed/vector_kernels.hip.cpp 
b/hip/distributed/vector_kernels.hip.cpp index 320d847ed85..fc6718dec0d 100644 --- a/hip/distributed/vector_kernels.hip.cpp +++ b/hip/distributed/vector_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/distributed/vector_kernels.hpp" -#include - - #include #include #include diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp index fc05273bb09..49608d6801f 100644 --- a/hip/factorization/par_ilu_kernels.hip.cpp +++ b/hip/factorization/par_ilu_kernels.hip.cpp @@ -5,7 +5,6 @@ #include "core/factorization/par_ilu_kernels.hpp" -#include #include From 8162796cf0dc74461aaa9078aa242b9b3eefe488 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 24 Jun 2024 15:49:44 +0200 Subject: [PATCH 010/448] fix replacement errors --- dpcpp/CMakeLists.txt | 2 +- hip/CMakeLists.txt | 10 +++++----- omp/CMakeLists.txt | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 035134ac4e1..ee373243842 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -126,7 +126,7 @@ ginkgo_default_includes(ginkgo_dpcpp) ginkgo_install_library(ginkgo_dpcpp) if (GINKGO_CHECK_CIRCULAR_DEPS) - ginkgo_check_headers(ginkgo_dpcpp "GKO_COMPILING_DPCPP;GKO_GKO_DEVICE_NAMESPACE=dpcpp") + ginkgo_check_headers(ginkgo_dpcpp "GKO_COMPILING_DPCPP;GKO_DEVICE_NAMESPACE=dpcpp") endif() if(GINKGO_BUILD_TESTS) diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 99e167b9798..bf2d6a6cf58 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.21) include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) -add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_kernels.instantiate) -add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_kernels.instantiate) +add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE) +add_instantiation_files(. 
matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES @@ -41,11 +41,11 @@ set(GINKGO_HIP_SOURCES matrix/batch_dense_kernels.hip.cpp matrix/batch_ell_kernels.hip.cpp matrix/coo_kernels.hip.cpp - ${CSR_kernels.instantiate} + ${CSR_INSTANTIATE} matrix/dense_kernels.hip.cpp matrix/diagonal_kernels.hip.cpp matrix/ell_kernels.hip.cpp - ${FBCSR_kernels.instantiate} + ${FBCSR_INSTANTIATE} matrix/sellp_kernels.hip.cpp matrix/sparsity_csr_kernels.hip.cpp multigrid/pgm_kernels.hip.cpp @@ -138,7 +138,7 @@ ginkgo_default_includes(ginkgo_hip) ginkgo_install_library(ginkgo_hip) if (GINKGO_CHECK_CIRCULAR_DEPS) - ginkgo_check_headers(ginkgo_hip "GKO_COMPILING_HIP;GKO_GKO_DEVICE_NAMESPACE=hip") + ginkgo_check_headers(ginkgo_hip "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip") endif() if(GINKGO_BUILD_TESTS) diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 333bb2a9b21..41bec80673f 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -54,7 +54,7 @@ target_sources(ginkgo_omp ) ginkgo_compile_features(ginkgo_omp) -target_compile_definitions(ginkgo_omp PRIVATE GKO_COMPILING_OMP GKO_GKO_DEVICE_NAMESPACE=omp) +target_compile_definitions(ginkgo_omp PRIVATE GKO_COMPILING_OMP GKO_DEVICE_NAMESPACE=omp) # TODO FIXME: Currently nvhpc 22.7+ optimizations break the omp jacobi's custom # precision implementation (mantissa segmentation) @@ -94,7 +94,7 @@ ginkgo_default_includes(ginkgo_omp) ginkgo_install_library(ginkgo_omp) if (GINKGO_CHECK_CIRCULAR_DEPS) - ginkgo_check_headers(ginkgo_omp "GKO_COMPILING_OMP;GKO_GKO_DEVICE_NAMESPACE=omp") + ginkgo_check_headers(ginkgo_omp "GKO_COMPILING_OMP;GKO_DEVICE_NAMESPACE=omp") endif() if(GINKGO_BUILD_TESTS) From 8be5bcd81506f800306bc885eb41cc365d633ebe Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 26 Jun 2024 18:12:00 +0200 Subject: 
[PATCH 011/448] review updates - make sparselib/blas the only non-deprecated way of getting handles - fix header orders Co-authored-by: Yuhsiang M. Tsai --- benchmark/utils/cuda_linops.cpp | 16 ++++++++-------- benchmark/utils/hip_linops.hip.cpp | 8 ++++---- cuda/solver/common_trs_kernels.cuh | 4 ++-- hip/base/exception.hip.cpp | 4 +++- hip/base/hipblas_bindings.hip.hpp | 2 +- hip/base/hiprand_bindings.hip.hpp | 2 +- hip/base/hipsparse_bindings.hip.hpp | 2 +- hip/base/hipsparse_block_bindings.hip.hpp | 2 +- hip/base/pointer_mode_guard.hip.hpp | 4 +++- hip/base/types.hip.hpp | 4 +++- hip/matrix/fft_kernels.hip.cpp | 4 +++- hip/solver/common_trs_kernels.hip.hpp | 2 +- hip/solver/lower_trs_kernels.hip.cpp | 2 +- hip/solver/upper_trs_kernels.hip.cpp | 2 +- include/ginkgo/core/base/executor.hpp | 20 ++++++++++++-------- 15 files changed, 45 insertions(+), 33 deletions(-) diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp index f239740d655..1222301a4cf 100644 --- a/benchmark/utils/cuda_linops.cpp +++ b/benchmark/utils/cuda_linops.cpp @@ -139,7 +139,7 @@ class CusparseCsrmp auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmv_mp( - this->get_gpu_exec()->get_cusparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, this->get_size()[0], this->get_size()[1], csr_->get_num_stored_elements(), &scalars.get_const_data()[0], this->get_descr(), csr_->get_const_values(), @@ -213,7 +213,7 @@ class CusparseCsr auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmv( - this->get_gpu_exec()->get_cusparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, this->get_size()[0], this->get_size()[1], csr_->get_num_stored_elements(), &scalars.get_const_data()[0], this->get_descr(), csr_->get_const_values(), @@ -288,7 +288,7 @@ class CusparseCsrmm auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); 
gko::kernels::cuda::cusparse::spmm( - this->get_gpu_exec()->get_cusparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, this->get_size()[0], dense_b->get_size()[1], this->get_size()[1], csr_->get_num_stored_elements(), &scalars.get_const_data()[0], this->get_descr(), csr_->get_const_values(), @@ -376,7 +376,7 @@ class CusparseCsrEx gko::size_type buffer_size = 0; auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); - auto handle = this->get_gpu_exec()->get_cusparse_handle(); + auto handle = this->get_gpu_exec()->get_sparselib_handle(); // This function seems to require the pointer mode to be set to HOST. // Ginkgo use pointer mode DEVICE by default, so we change this // temporarily. @@ -465,7 +465,7 @@ class CusparseHybrid auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::csr2hyb( - this->get_gpu_exec()->get_cusparse_handle(), this->get_size()[0], + this->get_gpu_exec()->get_sparselib_handle(), this->get_size()[0], this->get_size()[1], this->get_descr(), t_csr->get_const_values(), t_csr->get_const_row_ptrs(), t_csr->get_const_col_idxs(), hyb_, Threshold, Partition); @@ -496,7 +496,7 @@ class CusparseHybrid auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmv( - this->get_gpu_exec()->get_cusparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, &scalars.get_const_data()[0], this->get_descr(), hyb_, db, &scalars.get_const_data()[1], dx); } @@ -555,13 +555,13 @@ void cusparse_generic_spmv(std::shared_ptr gpu_exec, gko::size_type buffer_size = 0; GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV_bufferSize( - gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0], + gpu_exec->get_sparselib_handle(), trans, &scalars.get_const_data()[0], mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, &buffer_size)); gko::array buffer_array(gpu_exec, buffer_size); auto dbuffer = buffer_array.get_data(); 
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV( - gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0], + gpu_exec->get_sparselib_handle(), trans, &scalars.get_const_data()[0], mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, dbuffer)); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecx)); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecb)); diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp index 2d952ce60e9..bcbeee5ca14 100644 --- a/benchmark/utils/hip_linops.hip.cpp +++ b/benchmark/utils/hip_linops.hip.cpp @@ -126,7 +126,7 @@ class HipsparseCsr auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::spmv( - this->get_gpu_exec()->get_hipsparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, this->get_size()[0], this->get_size()[1], csr_->get_num_stored_elements(), &scalars.get_const_data()[0], this->get_descr(), csr_->get_const_values(), @@ -201,7 +201,7 @@ class HipsparseCsrmm auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::spmm( - this->get_gpu_exec()->get_hipsparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, this->get_size()[0], dense_b->get_size()[1], this->get_size()[1], csr_->get_num_stored_elements(), &scalars.get_const_data()[0], this->get_descr(), csr_->get_const_values(), @@ -269,7 +269,7 @@ class HipsparseHybrid auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::csr2hyb( - this->get_gpu_exec()->get_hipsparse_handle(), this->get_size()[0], + this->get_gpu_exec()->get_sparselib_handle(), this->get_size()[0], this->get_size()[1], this->get_descr(), t_csr->get_const_values(), t_csr->get_const_row_ptrs(), t_csr->get_const_col_idxs(), hyb_, Threshold, Partition); @@ -300,7 +300,7 @@ class HipsparseHybrid auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::spmv( - 
this->get_gpu_exec()->get_hipsparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, &scalars.get_const_data()[0], this->get_descr(), hyb_, db, &scalars.get_const_data()[1], dx); } diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 549925bf2e7..9013f9172bc 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -66,7 +66,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct { CudaSolveStruct(std::shared_ptr exec, const matrix::Csr* matrix, size_type num_rhs, bool is_upper, bool unit_diag) - : handle{exec->get_cusparse_handle()}, + : handle{exec->get_sparselib_handle()}, spsm_descr{}, descr_a{}, num_rhs{num_rhs}, @@ -189,7 +189,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct { const matrix::Csr* matrix, size_type num_rhs, bool is_upper, bool unit_diag) : exec{exec}, - handle{exec->get_cusparse_handle()}, + handle{exec->get_sparselib_handle()}, algorithm{}, solve_info{}, policy{}, diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp index 3f569576c28..f0e17f4e873 100644 --- a/hip/base/exception.hip.cpp +++ b/hip/base/exception.hip.cpp @@ -8,7 +8,6 @@ #include -#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #include @@ -23,6 +22,9 @@ #include +#include "common/cuda_hip/base/runtime.hpp" + + namespace gko { diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp index 725c7e20698..d5dc94d6138 100644 --- a/hip/base/hipblas_bindings.hip.hpp +++ b/hip/base/hipblas_bindings.hip.hpp @@ -6,7 +6,6 @@ #define GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_ -#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -18,6 +17,7 @@ #include +#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp index 1dd772db250..9fd7ade8231 
100644 --- a/hip/base/hiprand_bindings.hip.hpp +++ b/hip/base/hiprand_bindings.hip.hpp @@ -6,7 +6,6 @@ #define GKO_HIP_BASE_HIPRAND_BINDINGS_HIP_HPP_ -#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -17,6 +16,7 @@ #include +#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp index 997fc3d525f..0337f0a03c6 100644 --- a/hip/base/hipsparse_bindings.hip.hpp +++ b/hip/base/hipsparse_bindings.hip.hpp @@ -6,7 +6,6 @@ #define GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_ -#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -18,6 +17,7 @@ #include +#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp index c69b0353f22..6fb70c4571c 100644 --- a/hip/base/hipsparse_block_bindings.hip.hpp +++ b/hip/base/hipsparse_block_bindings.hip.hpp @@ -6,7 +6,6 @@ #define GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_ -#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -17,6 +16,7 @@ #include +#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp index 2c980b113a7..5cd4b3ec58f 100644 --- a/hip/base/pointer_mode_guard.hip.hpp +++ b/hip/base/pointer_mode_guard.hip.hpp @@ -9,7 +9,6 @@ #include -#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #include @@ -24,6 +23,9 @@ #include +#include "common/cuda_hip/base/runtime.hpp" + + namespace gko { namespace kernels { namespace hip { diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index cb33cbf5df8..9ae2224c064 100644 --- a/hip/base/types.hip.hpp +++ 
b/hip/base/types.hip.hpp @@ -16,7 +16,6 @@ #include -#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -28,6 +27,9 @@ #include +#include "common/cuda_hip/base/runtime.hpp" + + namespace gko { diff --git a/hip/matrix/fft_kernels.hip.cpp b/hip/matrix/fft_kernels.hip.cpp index 92358d732c7..31e180b4414 100644 --- a/hip/matrix/fft_kernels.hip.cpp +++ b/hip/matrix/fft_kernels.hip.cpp @@ -8,7 +8,6 @@ #include -#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -21,6 +20,9 @@ #include +#include "common/cuda_hip/base/runtime.hpp" + + namespace gko { diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp index daab3a387e6..555a62d57a0 100644 --- a/hip/solver/common_trs_kernels.hip.hpp +++ b/hip/solver/common_trs_kernels.hip.hpp @@ -10,7 +10,6 @@ #include -#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -23,6 +22,7 @@ #include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" #include "core/matrix/dense_kernels.hpp" diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp index 1a43b3c0151..d355940a487 100644 --- a/hip/solver/lower_trs_kernels.hip.cpp +++ b/hip/solver/lower_trs_kernels.hip.cpp @@ -8,7 +8,6 @@ #include -#include "common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -21,6 +20,7 @@ #include +#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp index bcb63a26bc8..2a31e450d27 100644 --- a/hip/solver/upper_trs_kernels.hip.cpp +++ b/hip/solver/upper_trs_kernels.hip.cpp @@ -8,7 +8,6 @@ #include -#include 
"common/cuda_hip/base/runtime.hpp" #if HIP_VERSION >= 50200000 #include #else @@ -21,6 +20,7 @@ #include +#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index ca20dba9007..761405c0b3d 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -1600,21 +1600,23 @@ class CudaExecutor : public detail::ExecutorBase, * * @return the cublas handle (cublasContext*) for this executor */ - cublasContext* get_cublas_handle() const { return cublas_handle_.get(); } + GKO_DEPRECATED("use get_blas_handle() instead") + cublasContext* get_cublas_handle() const { return get_blas_handle(); } /** * @copydoc get_cublas_handle() */ - cublasContext* get_blas_handle() const { return get_cublas_handle(); } + cublasContext* get_blas_handle() const { return cublas_handle_.get(); } /** * Get the cusparse handle for this executor * * @return the cusparse handle (cusparseContext*) for this executor */ + GKO_DEPRECATED("use get_sparselib_handle() instead") cusparseContext* get_cusparse_handle() const { - return cusparse_handle_.get(); + return get_sparselib_handle(); } /** @@ -1622,7 +1624,7 @@ class CudaExecutor : public detail::ExecutorBase, */ cusparseContext* get_sparselib_handle() const { - return get_cusparse_handle(); + return cusparse_handle_.get(); } /** @@ -1818,21 +1820,23 @@ class HipExecutor : public detail::ExecutorBase, * * @return the hipblas handle (hipblasContext*) for this executor */ - hipblasContext* get_hipblas_handle() const { return hipblas_handle_.get(); } + GKO_DEPRECATED("use get_blas_handle() instead") + hipblasContext* get_hipblas_handle() const { return get_blas_handle(); } /** * @copydoc get_hipblas_handle() */ - hipblasContext* get_blas_handle() const { return get_hipblas_handle(); } + hipblasContext* 
get_blas_handle() const { return hipblas_handle_.get(); } /** * Get the hipsparse handle for this executor * * @return the hipsparse handle (hipsparseContext*) for this executor */ + GKO_DEPRECATED("use get_sparselib_handle() instead") hipsparseContext* get_hipsparse_handle() const { - return hipsparse_handle_.get(); + return get_sparselib_handle(); } /** @@ -1840,7 +1844,7 @@ class HipExecutor : public detail::ExecutorBase, */ hipsparseContext* get_sparselib_handle() const { - return get_hipsparse_handle(); + return hipsparse_handle_.get(); } /** From 1e2d818d1ce6e43fc66916f1ea9451d9196b65ae Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 26 Jun 2024 18:13:00 +0200 Subject: [PATCH 012/448] disable cuSPARSE deprecation warnings --- cuda/base/cusparse_bindings.hpp | 17 ++++------------- cuda/base/cusparse_block_bindings.hpp | 2 ++ cuda/matrix/csr_kernels.template.cu | 2 +- hip/matrix/csr_kernels.template.hip.cpp | 2 +- 4 files changed, 8 insertions(+), 15 deletions(-) diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp index 06aaf0c6f1d..c18e1d7e9a6 100644 --- a/cuda/base/cusparse_bindings.hpp +++ b/cuda/base/cusparse_bindings.hpp @@ -940,6 +940,7 @@ inline void destroy(csrsm2Info_t info) #endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11031) +GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS inline csrilu02Info_t create_ilu0_info() { csrilu02Info_t info{}; @@ -966,6 +967,7 @@ inline void destroy_ic0_info(csric02Info_t info) { GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsric02Info(info)); } +GKO_END_DISABLE_DEPRECATION_WARNINGS #if (defined(CUDA_VERSION) && (CUDA_VERSION < 11031)) @@ -1174,19 +1176,6 @@ void spsm_solve(cusparseHandle_t handle, cusparseOperation_t op_a, #endif // (defined(CUDA_VERSION) && (CUDA_VERSION >= 11031)) -template -void create_identity_permutation(cusparseHandle_t handle, IndexType size, - IndexType* permutation) GKO_NOT_IMPLEMENTED; - -template <> -inline void create_identity_permutation(cusparseHandle_t 
handle, - int32 size, int32* permutation) -{ - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseCreateIdentityPermutation(handle, size, permutation)); -} - - template void csrsort_buffer_size(cusparseHandle_t handle, IndexType m, IndexType n, IndexType nnz, const IndexType* row_ptrs, @@ -1264,6 +1253,7 @@ inline void gather(cusparseHandle_t handle, cusparseDnVecDescr_t in, #endif +GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS template void ilu0_buffer_size(cusparseHandle_t handle, IndexType m, IndexType nnz, const cusparseMatDescr_t descr, const ValueType* vals, @@ -1458,6 +1448,7 @@ GKO_BIND_CUSPARSE_IC0(float, cusparseScsric02); GKO_BIND_CUSPARSE_IC0(double, cusparseDcsric02); GKO_BIND_CUSPARSE_IC0(std::complex, cusparseCcsric02); GKO_BIND_CUSPARSE_IC0(std::complex, cusparseZcsric02); +GKO_END_DISABLE_DEPRECATION_WARNINGS #undef GKO_BIND_CUSPARSE_IC0 diff --git a/cuda/base/cusparse_block_bindings.hpp b/cuda/base/cusparse_block_bindings.hpp index fc64c19796c..c3db763f0da 100644 --- a/cuda/base/cusparse_block_bindings.hpp +++ b/cuda/base/cusparse_block_bindings.hpp @@ -190,6 +190,7 @@ GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(std::complex, cusparseZgebsr2gebsc); #undef GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32 +GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS inline std::unique_ptr, std::function> create_bsr_trsm_info() @@ -457,6 +458,7 @@ GKO_BIND_CUSPARSE_BILU0(std::complex, cusparseCbsrilu02); GKO_BIND_CUSPARSE_BILU0(std::complex, cusparseZbsrilu02); #undef GKO_BIND_CUSPARSE_BILU0 +GKO_END_DISABLE_DEPRECATION_WARNINGS } // namespace cusparse diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu index 73ce267ec65..a0a7e4e97b8 100644 --- a/cuda/matrix/csr_kernels.template.cu +++ b/cuda/matrix/csr_kernels.template.cu @@ -968,7 +968,7 @@ void sort_by_column_index(std::shared_ptr exec, // init identity permutation array permutation_array(exec, nnz); auto permutation = permutation_array.get_data(); - sparselib::create_identity_permutation(handle, nnz, permutation); + 
components::fill_seq_array(exec, permutation, nnz); // allocate buffer size_type buffer_size{}; diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp index 31debd60a3d..8b3579f049c 100644 --- a/hip/matrix/csr_kernels.template.hip.cpp +++ b/hip/matrix/csr_kernels.template.hip.cpp @@ -772,7 +772,7 @@ void sort_by_column_index(std::shared_ptr exec, // init identity permutation array permutation_array(exec, nnz); auto permutation = permutation_array.get_data(); - sparselib::create_identity_permutation(handle, nnz, permutation); + components::fill_seq_array(exec, permutation, nnz); // allocate buffer size_type buffer_size{}; From eee11062ecfa37a8a6c5a2c1bc73f463f9984a06 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 27 Jun 2024 16:16:20 +0200 Subject: [PATCH 013/448] replace remaining usages of sparselib-specific macros --- benchmark/utils/cuda_linops.cpp | 14 +++++++------- benchmark/utils/hip_linops.hip.cpp | 6 +++--- cuda/solver/common_trs_kernels.cuh | 24 ++++++++++++------------ hip/solver/common_trs_kernels.hip.hpp | 8 ++++---- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp index 1222301a4cf..a404f9151ea 100644 --- a/benchmark/utils/cuda_linops.cpp +++ b/benchmark/utils/cuda_linops.cpp @@ -156,7 +156,7 @@ class CusparseCsrmp : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: @@ -230,7 +230,7 @@ class CusparseCsr : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: @@ -306,7 +306,7 @@ class CusparseCsrmm : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + 
trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: @@ -407,7 +407,7 @@ class CusparseCsrEx : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE), + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE), buffer_(exec) { algmode_ = CUSPARSE_ALG_MERGE_PATH; @@ -508,7 +508,7 @@ class CusparseHybrid CusparseHybrid(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) { auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateHybMat(&hyb_)); @@ -654,7 +654,7 @@ class CusparseGenericCsr : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: @@ -745,7 +745,7 @@ class CusparseGenericCoo const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), coo_(std::move(coo::create(exec))), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp index bcbeee5ca14..f0d7edb45c3 100644 --- a/benchmark/utils/hip_linops.hip.cpp +++ b/benchmark/utils/hip_linops.hip.cpp @@ -143,7 +143,7 @@ class HipsparseCsr : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: @@ -219,7 +219,7 @@ class HipsparseCsrmm : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: @@ -312,7 +312,7 @@ class HipsparseHybrid HipsparseHybrid(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), - 
trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) { auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateHybMat(&hyb_)); diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 9013f9172bc..992974e95ef 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -102,14 +102,14 @@ struct CudaSolveStruct : gko::solver::SolveStruct { reinterpret_cast(0xDEAF0)); auto work_size = sparselib::spsm_buffer_size( - handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, one(), descr_a, + handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_NON_TRANSPOSE, one(), descr_a, descr_b, descr_c, CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr); work.resize_and_reset(work_size); - sparselib::spsm_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, + sparselib::spsm_analysis(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_NON_TRANSPOSE, one(), descr_a, descr_b, descr_c, CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr, work.get_data()); @@ -141,8 +141,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct { auto descr_c = sparselib::create_dnmat( output->get_size(), output->get_stride(), output->get_values()); - sparselib::spsm_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, + sparselib::spsm_solve(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_NON_TRANSPOSE, one(), descr_a, descr_b, descr_c, CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr); @@ -215,8 +215,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct { size_type work_size{}; sparselib::buffer_size_ext( - handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, + handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, 
matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, @@ -226,8 +226,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct { work.resize_and_reset(work_size); sparselib::csrsm2_analysis( - handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, + handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, @@ -253,8 +253,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct { sparselib::pointer_mode_guard pm_guard(handle); dense::copy(exec, input, output); sparselib::csrsm2_solve( - handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], + handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], output->get_stride(), matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp index 555a62d57a0..9fac4be8547 100644 --- a/hip/solver/common_trs_kernels.hip.hpp +++ b/hip/solver/common_trs_kernels.hip.hpp @@ -125,7 +125,7 @@ void generate_kernel(std::shared_ptr exec, { sparselib::pointer_mode_guard pm_guard(handle); sparselib::csrsv2_buffer_size( - handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + handle, SPARSELIB_OPERATION_NON_TRANSPOSE, matrix->get_size()[0], matrix->get_num_stored_elements(), hip_solve_struct->factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), @@ -140,7 +140,7 @@ void generate_kernel(std::shared_ptr exec, 
exec->alloc(hip_solve_struct->factor_work_size); sparselib::csrsv2_analysis( - handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + handle, SPARSELIB_OPERATION_NON_TRANSPOSE, matrix->get_size()[0], matrix->get_num_stored_elements(), hip_solve_struct->factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), @@ -180,7 +180,7 @@ void solve_kernel(std::shared_ptr exec, sparselib::pointer_mode_guard pm_guard(handle); if (b->get_stride() == 1) { sparselib::csrsv2_solve( - handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + handle, SPARSELIB_OPERATION_NON_TRANSPOSE, matrix->get_size()[0], matrix->get_num_stored_elements(), &one, hip_solve_struct->factor_descr, @@ -195,7 +195,7 @@ void solve_kernel(std::shared_ptr exec, dense::transpose(exec, x, trans_x); for (IndexType i = 0; i < trans_b->get_size()[0]; i++) { sparselib::csrsv2_solve( - handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + handle, SPARSELIB_OPERATION_NON_TRANSPOSE, matrix->get_size()[0], matrix->get_num_stored_elements(), &one, hip_solve_struct->factor_descr, From ce09e815e451cf96e423766b7ff4ac1341fa5750 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 15 Nov 2023 15:15:57 +0000 Subject: [PATCH 014/448] adds script to change main include to use "" instead of <> TODO: remove or revert this commit --- dev_tools/scripts/change-main-include.py | 60 ++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100755 dev_tools/scripts/change-main-include.py diff --git a/dev_tools/scripts/change-main-include.py b/dev_tools/scripts/change-main-include.py new file mode 100755 index 00000000000..7ee5e8cd922 --- /dev/null +++ b/dev_tools/scripts/change-main-include.py @@ -0,0 +1,60 @@ +#! 
/usr/bin/env python3 +import collections +import sys +import re + +files = sys.argv[1:] + +test_subdirectories = [ + "base", "config", "distributed", "factorization", + "log", "matrix", "multigrid", "preconditioner", + "reorder", "solver", "stop", "synthesizer" +] + +false_positives = [ + "test/utils/executor.hpp", + "test/utils/mpi/executor.hpp" +] + + +for filename in files: + suffix = re.compile(r"(\.cpp|\.cu|\.inc)$") + main_include_re = re.compile(r"#include\s+]+)>") + + Match = collections.namedtuple("Match", ["idx", "line"]) + + if not suffix.search(filename): + continue + + if any(f"test/{subdir}" in filename for subdir in test_subdirectories): + continue + + if any(filename.endswith(fp) for fp in false_positives): + continue + + with open(filename, 'r') as file: + content = file.readlines() + + try: + first_include = next(Match(idx=i, line=l) for i, l in enumerate(content) if l.startswith("#include")) + except: + first_include = Match(idx=-1, line="") + if "', '"') + with open(filename, 'w') as file: + file.writelines(content) From 88f2197af98e705fa85aa3f1bbec1885b7377e31 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 28 Jun 2024 09:12:51 +0200 Subject: [PATCH 015/448] automatically change the 'main' include changes from: `./dev_tools/scripts/change-main-include.py $(git ls-files)` This is done based on the heuristic that the first include is the main include, except in some cases. 
Here are the cases where the heuristic can't determine either way: benchmark/utils/mpi_timer.cpp benchmark/utils/tuning_variables.cpp core/base/mpi.cpp core/base/segmented_array.cpp core/base/version.cpp core/config/multigrid_config.cpp core/distributed/index_map.cpp core/distributed/partition.cpp core/log/logger.cpp core/preconditioner/batch_jacobi.cpp core/stop/combined.cpp core/stop/criterion.cpp core/stop/iteration.cpp core/stop/time.cpp cuda/base/version.cpp devices/cuda/executor.cpp devices/hip/executor.cpp dpcpp/base/version.dp.cpp hip/base/version.hip.cpp omp/base/version.cpp reference/base/version.cpp --- core/base/array.cpp | 2 +- core/base/batch_multi_vector.cpp | 2 +- core/base/block_operator.cpp | 2 +- core/base/combination.cpp | 2 +- core/base/composition.cpp | 2 +- core/base/dense_cache.cpp | 2 +- core/base/device_matrix_data.cpp | 2 +- core/base/executor.cpp | 2 +- core/base/index_set.cpp | 2 +- core/base/memory.cpp | 2 +- core/base/mtx_io.cpp | 2 +- core/base/perturbation.cpp | 2 +- core/base/timer.cpp | 2 +- core/config/config.cpp | 2 +- core/config/property_tree.cpp | 2 +- core/config/registry.cpp | 2 +- core/config/type_descriptor.cpp | 2 +- core/distributed/matrix.cpp | 2 +- core/distributed/partition_helpers.cpp | 2 +- core/distributed/preconditioner/schwarz.cpp | 2 +- core/distributed/vector.cpp | 2 +- core/factorization/cholesky.cpp | 2 +- core/factorization/factorization.cpp | 2 +- core/factorization/ic.cpp | 2 +- core/factorization/ilu.cpp | 2 +- core/factorization/lu.cpp | 2 +- core/factorization/par_ic.cpp | 2 +- core/factorization/par_ict.cpp | 2 +- core/factorization/par_ilu.cpp | 2 +- core/factorization/par_ilut.cpp | 2 +- core/log/batch_logger.cpp | 2 +- core/log/convergence.cpp | 2 +- core/log/papi.cpp | 2 +- core/log/performance_hint.cpp | 2 +- core/log/profiler_hook.cpp | 2 +- core/log/record.cpp | 2 +- core/log/stream.cpp | 2 +- core/matrix/batch_csr.cpp | 2 +- core/matrix/batch_dense.cpp | 2 +- core/matrix/batch_ell.cpp | 2 +- 
core/matrix/batch_identity.cpp | 2 +- core/matrix/coo.cpp | 2 +- core/matrix/csr.cpp | 2 +- core/matrix/dense.cpp | 2 +- core/matrix/diagonal.cpp | 2 +- core/matrix/ell.cpp | 2 +- core/matrix/fbcsr.cpp | 2 +- core/matrix/fft.cpp | 2 +- core/matrix/hybrid.cpp | 2 +- core/matrix/identity.cpp | 2 +- core/matrix/permutation.cpp | 2 +- core/matrix/row_gatherer.cpp | 2 +- core/matrix/scaled_permutation.cpp | 2 +- core/matrix/sellp.cpp | 2 +- core/matrix/sparsity_csr.cpp | 2 +- core/multigrid/fixed_coarsening.cpp | 2 +- core/multigrid/pgm.cpp | 2 +- core/preconditioner/ic.cpp | 2 +- core/preconditioner/ilu.cpp | 2 +- core/preconditioner/isai.cpp | 2 +- core/preconditioner/jacobi.cpp | 2 +- core/reorder/amd.cpp | 2 +- core/reorder/mc64.cpp | 2 +- core/reorder/nested_dissection.cpp | 2 +- core/reorder/rcm.cpp | 2 +- core/reorder/scaled_reordered.cpp | 2 +- core/solver/batch_bicgstab.cpp | 2 +- core/solver/batch_cg.cpp | 2 +- core/solver/bicg.cpp | 2 +- core/solver/bicgstab.cpp | 2 +- core/solver/cb_gmres.cpp | 2 +- core/solver/cg.cpp | 2 +- core/solver/cgs.cpp | 2 +- core/solver/direct.cpp | 2 +- core/solver/fcg.cpp | 2 +- core/solver/gcr.cpp | 2 +- core/solver/gmres.cpp | 2 +- core/solver/idr.cpp | 2 +- core/solver/ir.cpp | 2 +- core/solver/multigrid.cpp | 2 +- core/stop/residual_norm.cpp | 2 +- cuda/base/exception.cpp | 2 +- cuda/base/executor.cpp | 2 +- cuda/base/memory.cpp | 2 +- cuda/base/stream.cpp | 2 +- cuda/base/timer.cpp | 2 +- devices/dpcpp/executor.cpp | 2 +- devices/omp/executor.cpp | 2 +- dpcpp/base/executor.dp.cpp | 2 +- dpcpp/base/timer.dp.cpp | 2 +- hip/base/device.hip.cpp | 2 +- hip/base/exception.hip.cpp | 2 +- hip/base/executor.hip.cpp | 2 +- hip/base/memory.hip.cpp | 2 +- hip/base/stream.hip.cpp | 2 +- hip/base/timer.hip.cpp | 2 +- omp/base/executor.cpp | 2 +- 97 files changed, 97 insertions(+), 97 deletions(-) diff --git a/core/base/array.cpp b/core/base/array.cpp index 44142e0fa2d..f529e3cf9d2 100644 --- a/core/base/array.cpp +++ b/core/base/array.cpp 
@@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/array.hpp" #include diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index c4ec023e323..960158654f2 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/batch_multi_vector.hpp" #include diff --git a/core/base/block_operator.cpp b/core/base/block_operator.cpp index b8190bad02d..43ac79c3c0e 100644 --- a/core/base/block_operator.cpp +++ b/core/base/block_operator.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/block_operator.hpp" #include diff --git a/core/base/combination.cpp b/core/base/combination.cpp index 01d1d197820..324fa8d4ddf 100644 --- a/core/base/combination.cpp +++ b/core/base/combination.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/combination.hpp" #include diff --git a/core/base/composition.cpp b/core/base/composition.cpp index cf3789c45a7..515fb425633 100644 --- a/core/base/composition.cpp +++ b/core/base/composition.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/composition.hpp" #include diff --git a/core/base/dense_cache.cpp b/core/base/dense_cache.cpp index e321b38b442..50e1abc3977 100644 --- a/core/base/dense_cache.cpp +++ b/core/base/dense_cache.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/dense_cache.hpp" #include diff --git a/core/base/device_matrix_data.cpp b/core/base/device_matrix_data.cpp index 4190ee4f6d0..085054cbd69 100644 --- a/core/base/device_matrix_data.cpp +++ b/core/base/device_matrix_data.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/device_matrix_data.hpp" #include diff --git a/core/base/executor.cpp 
b/core/base/executor.cpp index a0efdc2291e..1fb1703c56f 100644 --- a/core/base/executor.cpp +++ b/core/base/executor.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/executor.hpp" #include diff --git a/core/base/index_set.cpp b/core/base/index_set.cpp index c40f57586b8..b27d3803448 100644 --- a/core/base/index_set.cpp +++ b/core/base/index_set.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/index_set.hpp" #include diff --git a/core/base/memory.cpp b/core/base/memory.cpp index be3b231dedd..0b3e0ce833b 100644 --- a/core/base/memory.cpp +++ b/core/base/memory.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/memory.hpp" #include diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index a8208593096..e2f2dbf5d9b 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/mtx_io.hpp" #include diff --git a/core/base/perturbation.cpp b/core/base/perturbation.cpp index 89b7f9d67fd..94a4975cfa0 100644 --- a/core/base/perturbation.cpp +++ b/core/base/perturbation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/perturbation.hpp" #include diff --git a/core/base/timer.cpp b/core/base/timer.cpp index eb060d1bbce..abd5fbf61cd 100644 --- a/core/base/timer.cpp +++ b/core/base/timer.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/timer.hpp" #include diff --git a/core/config/config.cpp b/core/config/config.cpp index 291c7cab41c..87dd49b6c03 100644 --- a/core/config/config.cpp +++ b/core/config/config.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/config/config.hpp" #include diff --git a/core/config/property_tree.cpp b/core/config/property_tree.cpp index 
47e627d21e6..1ab33712953 100644 --- a/core/config/property_tree.cpp +++ b/core/config/property_tree.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/config/property_tree.hpp" #include diff --git a/core/config/registry.cpp b/core/config/registry.cpp index 8ff619b4250..8b8bdbcaf0d 100644 --- a/core/config/registry.cpp +++ b/core/config/registry.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/config/registry.hpp" #include diff --git a/core/config/type_descriptor.cpp b/core/config/type_descriptor.cpp index c2885407cad..cbc29c5088a 100644 --- a/core/config/type_descriptor.cpp +++ b/core/config/type_descriptor.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/config/type_descriptor.hpp" #include diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 1dcddbd1a6a..2d2d1304769 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/distributed/matrix.hpp" #include diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index 50b9bee0e5f..1a55daf8134 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/distributed/partition_helpers.hpp" #include diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp index 3bf61ac43d0..2def0a0f85c 100644 --- a/core/distributed/preconditioner/schwarz.cpp +++ b/core/distributed/preconditioner/schwarz.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/distributed/preconditioner/schwarz.hpp" #include diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 52993faa4bd..2e57fcf7451 100644 --- 
a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/distributed/vector.hpp" #include diff --git a/core/factorization/cholesky.cpp b/core/factorization/cholesky.cpp index 63bbde4f2fd..12456df4abc 100644 --- a/core/factorization/cholesky.cpp +++ b/core/factorization/cholesky.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/factorization/cholesky.hpp" #include diff --git a/core/factorization/factorization.cpp b/core/factorization/factorization.cpp index 00cdd12648d..597fc7b48f4 100644 --- a/core/factorization/factorization.cpp +++ b/core/factorization/factorization.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/factorization/factorization.hpp" #include diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp index 763a6364d09..67fb3df5b46 100644 --- a/core/factorization/ic.cpp +++ b/core/factorization/ic.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/factorization/ic.hpp" #include diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp index 5ae4ccb9654..15f3cef1831 100644 --- a/core/factorization/ilu.cpp +++ b/core/factorization/ilu.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/factorization/ilu.hpp" #include diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp index e0da4ceb429..8ab1ddfc37f 100644 --- a/core/factorization/lu.cpp +++ b/core/factorization/lu.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/factorization/lu.hpp" #include diff --git a/core/factorization/par_ic.cpp b/core/factorization/par_ic.cpp index 3bd415257f7..c21f66934aa 100644 --- a/core/factorization/par_ic.cpp +++ b/core/factorization/par_ic.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include 
+#include "ginkgo/core/factorization/par_ic.hpp" #include diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp index 8a7f8297f7e..54176d79545 100644 --- a/core/factorization/par_ict.cpp +++ b/core/factorization/par_ict.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/factorization/par_ict.hpp" #include diff --git a/core/factorization/par_ilu.cpp b/core/factorization/par_ilu.cpp index 963b085d76f..f69947adcac 100644 --- a/core/factorization/par_ilu.cpp +++ b/core/factorization/par_ilu.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/factorization/par_ilu.hpp" #include diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp index da45642490b..ff4b5b2a83e 100644 --- a/core/factorization/par_ilut.cpp +++ b/core/factorization/par_ilut.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/factorization/par_ilut.hpp" #include diff --git a/core/log/batch_logger.cpp b/core/log/batch_logger.cpp index e18ecd2d5e9..532cae64c28 100644 --- a/core/log/batch_logger.cpp +++ b/core/log/batch_logger.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/log/batch_logger.hpp" #include diff --git a/core/log/convergence.cpp b/core/log/convergence.cpp index 51dc3cc32c8..16c89e08ffc 100644 --- a/core/log/convergence.cpp +++ b/core/log/convergence.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/log/convergence.hpp" #include diff --git a/core/log/papi.cpp b/core/log/papi.cpp index ce23eb8ee29..83a9bd3b93c 100644 --- a/core/log/papi.cpp +++ b/core/log/papi.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/log/papi.hpp" #include diff --git a/core/log/performance_hint.cpp b/core/log/performance_hint.cpp index fb06fdf4be8..3b0a720aa93 100644 --- a/core/log/performance_hint.cpp 
+++ b/core/log/performance_hint.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/log/performance_hint.hpp" #include diff --git a/core/log/profiler_hook.cpp b/core/log/profiler_hook.cpp index a8eef7668f2..87ea8f42d02 100644 --- a/core/log/profiler_hook.cpp +++ b/core/log/profiler_hook.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/log/profiler_hook.hpp" #include diff --git a/core/log/record.cpp b/core/log/record.cpp index f58f6747ff5..6d995cd348c 100644 --- a/core/log/record.cpp +++ b/core/log/record.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/log/record.hpp" #include diff --git a/core/log/stream.cpp b/core/log/stream.cpp index c02ecc77b09..033575c9b54 100644 --- a/core/log/stream.cpp +++ b/core/log/stream.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/log/stream.hpp" #include diff --git a/core/matrix/batch_csr.cpp b/core/matrix/batch_csr.cpp index 96301f3e97b..8e4b1434f8e 100644 --- a/core/matrix/batch_csr.cpp +++ b/core/matrix/batch_csr.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/batch_csr.hpp" #include diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index ea6a19aa21c..a2eb017cf7c 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/batch_dense.hpp" #include diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp index 0db9640b406..5c3da632643 100644 --- a/core/matrix/batch_ell.cpp +++ b/core/matrix/batch_ell.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/batch_ell.hpp" #include diff --git a/core/matrix/batch_identity.cpp b/core/matrix/batch_identity.cpp index 0de3101a62a..480f0a10474 100644 --- 
a/core/matrix/batch_identity.cpp +++ b/core/matrix/batch_identity.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/batch_identity.hpp" #include diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 9530dbd2624..eb8b33c0cf1 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/coo.hpp" #include diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index a84298b6f95..8dad86568fb 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/csr.hpp" #include diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index d1f3da8e166..eb52c574db9 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/dense.hpp" #include diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index b6fe45dd5d0..08b1e00e340 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/diagonal.hpp" #include diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index ec51627e058..f6433fe156a 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/ell.hpp" #include diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index 8141853a229..1ea00d741bd 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/fbcsr.hpp" #include diff --git a/core/matrix/fft.cpp b/core/matrix/fft.cpp index f86e8b94cf0..1ec69ce3338 100644 --- a/core/matrix/fft.cpp +++ b/core/matrix/fft.cpp @@ -2,7 +2,7 @@ // // 
SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/fft.hpp" #include diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index 920d5b39ed1..c30c60ce0fb 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/hybrid.hpp" #include diff --git a/core/matrix/identity.cpp b/core/matrix/identity.cpp index 5f264ad2a6d..a58601f31f0 100644 --- a/core/matrix/identity.cpp +++ b/core/matrix/identity.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/identity.hpp" #include diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp index 84989c7eddb..76f5d7c8005 100644 --- a/core/matrix/permutation.cpp +++ b/core/matrix/permutation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/permutation.hpp" #include diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp index 0f570cda410..72a6cbe2808 100644 --- a/core/matrix/row_gatherer.cpp +++ b/core/matrix/row_gatherer.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/row_gatherer.hpp" #include diff --git a/core/matrix/scaled_permutation.cpp b/core/matrix/scaled_permutation.cpp index ff1246a1299..c948c6071ad 100644 --- a/core/matrix/scaled_permutation.cpp +++ b/core/matrix/scaled_permutation.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/scaled_permutation.hpp" #include diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index 636fc3907ae..39e2c706b19 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/sellp.hpp" #include diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp index ed66ad09efb..2ec463613b0 100644 --- 
a/core/matrix/sparsity_csr.cpp +++ b/core/matrix/sparsity_csr.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/matrix/sparsity_csr.hpp" #include diff --git a/core/multigrid/fixed_coarsening.cpp b/core/multigrid/fixed_coarsening.cpp index 413614abf28..e7024d334ad 100644 --- a/core/multigrid/fixed_coarsening.cpp +++ b/core/multigrid/fixed_coarsening.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/multigrid/fixed_coarsening.hpp" #include diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index d60835ca944..f0393794d94 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/multigrid/pgm.hpp" #include diff --git a/core/preconditioner/ic.cpp b/core/preconditioner/ic.cpp index c4613d30ea6..37eb0cb5b3f 100644 --- a/core/preconditioner/ic.cpp +++ b/core/preconditioner/ic.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/preconditioner/ic.hpp" #include diff --git a/core/preconditioner/ilu.cpp b/core/preconditioner/ilu.cpp index 652ade0152c..00422300172 100644 --- a/core/preconditioner/ilu.cpp +++ b/core/preconditioner/ilu.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/preconditioner/ilu.hpp" #include diff --git a/core/preconditioner/isai.cpp b/core/preconditioner/isai.cpp index b10eec36691..f825e2f5c82 100644 --- a/core/preconditioner/isai.cpp +++ b/core/preconditioner/isai.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/preconditioner/isai.hpp" #include diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index b0e8224d06f..8081f31712a 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include 
"ginkgo/core/preconditioner/jacobi.hpp" #include diff --git a/core/reorder/amd.cpp b/core/reorder/amd.cpp index 1b3198b248f..7cb24c39ea0 100644 --- a/core/reorder/amd.cpp +++ b/core/reorder/amd.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/reorder/amd.hpp" #include diff --git a/core/reorder/mc64.cpp b/core/reorder/mc64.cpp index 1d4ad438a59..e47969c0b71 100644 --- a/core/reorder/mc64.cpp +++ b/core/reorder/mc64.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/reorder/mc64.hpp" #include diff --git a/core/reorder/nested_dissection.cpp b/core/reorder/nested_dissection.cpp index f609a15653c..bf9c8ba7a3d 100644 --- a/core/reorder/nested_dissection.cpp +++ b/core/reorder/nested_dissection.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/reorder/nested_dissection.hpp" #include diff --git a/core/reorder/rcm.cpp b/core/reorder/rcm.cpp index 5be8409ba79..f3a16cc92a6 100644 --- a/core/reorder/rcm.cpp +++ b/core/reorder/rcm.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/reorder/rcm.hpp" #include diff --git a/core/reorder/scaled_reordered.cpp b/core/reorder/scaled_reordered.cpp index 8ee0035101d..cf246ea3194 100644 --- a/core/reorder/scaled_reordered.cpp +++ b/core/reorder/scaled_reordered.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/reorder/scaled_reordered.hpp" #include diff --git a/core/solver/batch_bicgstab.cpp b/core/solver/batch_bicgstab.cpp index f322e042d27..9621f058097 100644 --- a/core/solver/batch_bicgstab.cpp +++ b/core/solver/batch_bicgstab.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/solver/batch_bicgstab.hpp" #include diff --git a/core/solver/batch_cg.cpp b/core/solver/batch_cg.cpp index 3efe95406e0..d2fe4a5f00d 100644 --- a/core/solver/batch_cg.cpp +++ 
b/core/solver/batch_cg.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/solver/batch_cg.hpp" #include diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp index b5831c33ada..51ba251aecd 100644 --- a/core/solver/bicg.cpp +++ b/core/solver/bicg.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/solver/bicg.hpp" #include diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp index c6ae33918a1..e1f2f1cb77e 100644 --- a/core/solver/bicgstab.cpp +++ b/core/solver/bicgstab.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/solver/bicgstab.hpp" #include diff --git a/core/solver/cb_gmres.cpp b/core/solver/cb_gmres.cpp index bb888d660e4..812c6c222ce 100644 --- a/core/solver/cb_gmres.cpp +++ b/core/solver/cb_gmres.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/solver/cb_gmres.hpp" #include diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp index f83faf7e20f..a8e534588a0 100644 --- a/core/solver/cg.cpp +++ b/core/solver/cg.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/solver/cg.hpp" #include diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp index 6bb41338f77..9d6a575fdbf 100644 --- a/core/solver/cgs.cpp +++ b/core/solver/cgs.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/solver/cgs.hpp" #include diff --git a/core/solver/direct.cpp b/core/solver/direct.cpp index d540aa584f0..717fd71698f 100644 --- a/core/solver/direct.cpp +++ b/core/solver/direct.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/solver/direct.hpp" #include diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp index 5966664c14d..dee37467c46 100644 --- a/core/solver/fcg.cpp +++ b/core/solver/fcg.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause 
-#include +#include "ginkgo/core/solver/fcg.hpp" #include diff --git a/core/solver/gcr.cpp b/core/solver/gcr.cpp index 24fb36aa42b..cb2b55a3460 100644 --- a/core/solver/gcr.cpp +++ b/core/solver/gcr.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/solver/gcr.hpp" #include diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index b261cf754eb..b0ad6baa01e 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/solver/gmres.hpp" #include diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index 9085876a85a..4bc56562d3b 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/solver/idr.hpp" #include diff --git a/core/solver/ir.cpp b/core/solver/ir.cpp index 16152dc63e9..3a6b0b1d2d0 100644 --- a/core/solver/ir.cpp +++ b/core/solver/ir.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/solver/ir.hpp" #include diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 6dd06747883..d7fc1d3c997 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/solver/multigrid.hpp" #include diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp index 44853670359..824ab87ec0f 100644 --- a/core/stop/residual_norm.cpp +++ b/core/stop/residual_norm.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/stop/residual_norm.hpp" #include diff --git a/cuda/base/exception.cpp b/cuda/base/exception.cpp index 24b5de36c6a..13557e3da50 100644 --- a/cuda/base/exception.cpp +++ b/cuda/base/exception.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/exception.hpp" #include diff 
--git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index 3d1dbf7c92c..c41bc6a72c6 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/executor.hpp" #include diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp index 5f36489744a..7949b07f78f 100644 --- a/cuda/base/memory.cpp +++ b/cuda/base/memory.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/memory.hpp" #include diff --git a/cuda/base/stream.cpp b/cuda/base/stream.cpp index f0d8086398c..703c9958ecd 100644 --- a/cuda/base/stream.cpp +++ b/cuda/base/stream.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/stream.hpp" #include diff --git a/cuda/base/timer.cpp b/cuda/base/timer.cpp index 35759f82dd4..01b96c19536 100644 --- a/cuda/base/timer.cpp +++ b/cuda/base/timer.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/timer.hpp" #include diff --git a/devices/dpcpp/executor.cpp b/devices/dpcpp/executor.cpp index 323e9efeca6..435d9426374 100644 --- a/devices/dpcpp/executor.cpp +++ b/devices/dpcpp/executor.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/executor.hpp" #include diff --git a/devices/omp/executor.cpp b/devices/omp/executor.cpp index db3058c8371..448d7b68d63 100644 --- a/devices/omp/executor.cpp +++ b/devices/omp/executor.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/executor.hpp" #include diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index 58eeec10d17..159ee7eb533 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/executor.hpp" #include diff --git a/dpcpp/base/timer.dp.cpp 
b/dpcpp/base/timer.dp.cpp index e14ef40a439..da347b14ddf 100644 --- a/dpcpp/base/timer.dp.cpp +++ b/dpcpp/base/timer.dp.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/timer.hpp" #include diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp index be897510056..d1d4325c6f1 100644 --- a/hip/base/device.hip.cpp +++ b/hip/base/device.hip.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/device.hpp" #include diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp index f0e17f4e873..05b030ad375 100644 --- a/hip/base/exception.hip.cpp +++ b/hip/base/exception.hip.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/exception.hpp" #include diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index 4b5ce7afa7b..e371e48f489 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/executor.hpp" #include diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp index 5fde8f518c6..27d510d784b 100644 --- a/hip/base/memory.hip.cpp +++ b/hip/base/memory.hip.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/memory.hpp" #include diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp index b56c5104428..d5acb978e22 100644 --- a/hip/base/stream.hip.cpp +++ b/hip/base/stream.hip.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/stream.hpp" #include diff --git a/hip/base/timer.hip.cpp b/hip/base/timer.hip.cpp index bd81d9f3be5..67a9a8153b6 100644 --- a/hip/base/timer.hip.cpp +++ b/hip/base/timer.hip.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/timer.hpp" #include diff --git a/omp/base/executor.cpp 
b/omp/base/executor.cpp index 7d969eb89f8..98ef2d528ae 100644 --- a/omp/base/executor.cpp +++ b/omp/base/executor.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/executor.hpp" #include From 2e227f70de4bdba33f0ab33caa15be53efb8f371 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 28 Jun 2024 09:25:38 +0200 Subject: [PATCH 016/448] manually changed main header --- core/base/version.cpp | 2 +- core/config/multigrid_config.cpp | 2 +- core/distributed/index_map.cpp | 2 +- core/distributed/partition.cpp | 2 +- core/log/logger.cpp | 2 +- core/preconditioner/batch_jacobi.cpp | 2 +- core/stop/combined.cpp | 2 +- core/stop/criterion.cpp | 2 +- core/stop/iteration.cpp | 2 +- core/stop/time.cpp | 2 +- cuda/base/version.cpp | 2 +- devices/cuda/executor.cpp | 2 +- devices/hip/executor.cpp | 2 +- dpcpp/base/version.dp.cpp | 2 +- hip/base/version.hip.cpp | 2 +- omp/base/version.cpp | 2 +- reference/base/version.cpp | 2 +- 17 files changed, 17 insertions(+), 17 deletions(-) diff --git a/core/base/version.cpp b/core/base/version.cpp index a7802a890dd..54f59eb7356 100644 --- a/core/base/version.cpp +++ b/core/base/version.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/version.hpp" namespace gko { diff --git a/core/config/multigrid_config.cpp b/core/config/multigrid_config.cpp index 6eb9f5ed872..553e6ca033d 100644 --- a/core/config/multigrid_config.cpp +++ b/core/config/multigrid_config.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/multigrid/pgm.hpp" #include "core/config/parse_macro.hpp" diff --git a/core/distributed/index_map.cpp b/core/distributed/index_map.cpp index 5d2a1aebe18..e24d8141b4d 100644 --- a/core/distributed/index_map.cpp +++ b/core/distributed/index_map.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/distributed/index_map.hpp" #include 
"core/distributed/index_map_kernels.hpp" diff --git a/core/distributed/partition.cpp b/core/distributed/partition.cpp index 8a2fefcad79..5e6903de872 100644 --- a/core/distributed/partition.cpp +++ b/core/distributed/partition.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/distributed/partition.hpp" #include "core/base/array_access.hpp" diff --git a/core/log/logger.cpp b/core/log/logger.cpp index f3d89a4657f..e141f1816dc 100644 --- a/core/log/logger.cpp +++ b/core/log/logger.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/log/logger.hpp" namespace gko { diff --git a/core/preconditioner/batch_jacobi.cpp b/core/preconditioner/batch_jacobi.cpp index 7d6ffa66848..3f18a32123f 100644 --- a/core/preconditioner/batch_jacobi.cpp +++ b/core/preconditioner/batch_jacobi.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/preconditioner/batch_jacobi.hpp" #include "core/matrix/batch_csr_kernels.hpp" diff --git a/core/stop/combined.cpp b/core/stop/combined.cpp index 594b9214c08..d29d65f73bc 100644 --- a/core/stop/combined.cpp +++ b/core/stop/combined.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/stop/combined.hpp" namespace gko { diff --git a/core/stop/criterion.cpp b/core/stop/criterion.cpp index c907e4e03cd..02f04876f9f 100644 --- a/core/stop/criterion.cpp +++ b/core/stop/criterion.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/stop/criterion.hpp" #include "core/stop/criterion_kernels.hpp" diff --git a/core/stop/iteration.cpp b/core/stop/iteration.cpp index 9e54a2c6384..2f712865eda 100644 --- a/core/stop/iteration.cpp +++ b/core/stop/iteration.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/stop/iteration.hpp" namespace gko { diff --git a/core/stop/time.cpp b/core/stop/time.cpp index 
5ff50c24b07..0481b9c91d3 100644 --- a/core/stop/time.cpp +++ b/core/stop/time.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/stop/time.hpp" namespace gko { diff --git a/cuda/base/version.cpp b/cuda/base/version.cpp index d6e4b9b1068..0b95067a1c8 100644 --- a/cuda/base/version.cpp +++ b/cuda/base/version.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/version.hpp" namespace gko { diff --git a/devices/cuda/executor.cpp b/devices/cuda/executor.cpp index 58261c318fb..ff17a9ba8cd 100644 --- a/devices/cuda/executor.cpp +++ b/devices/cuda/executor.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/executor.hpp" namespace gko { diff --git a/devices/hip/executor.cpp b/devices/hip/executor.cpp index 6954e31b24b..82001d667db 100644 --- a/devices/hip/executor.cpp +++ b/devices/hip/executor.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/executor.hpp" namespace gko { diff --git a/dpcpp/base/version.dp.cpp b/dpcpp/base/version.dp.cpp index f53a6d2820c..6c6f9371d01 100644 --- a/dpcpp/base/version.dp.cpp +++ b/dpcpp/base/version.dp.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/version.hpp" namespace gko { diff --git a/hip/base/version.hip.cpp b/hip/base/version.hip.cpp index f2490fa691c..512e5ca6f1a 100644 --- a/hip/base/version.hip.cpp +++ b/hip/base/version.hip.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/version.hpp" namespace gko { diff --git a/omp/base/version.cpp b/omp/base/version.cpp index e96bfe5b0a4..dbca513323a 100644 --- a/omp/base/version.cpp +++ b/omp/base/version.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/version.hpp" namespace gko { diff --git a/reference/base/version.cpp 
b/reference/base/version.cpp index 04e44ee1848..74697ff70ab 100644 --- a/reference/base/version.cpp +++ b/reference/base/version.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "ginkgo/core/base/version.hpp" namespace gko { From a20456bb7af148b675c6fd7f1d7d79c637a7fe22 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 16 Nov 2023 16:33:40 +0100 Subject: [PATCH 017/448] replace force-top with clang-format on/off --- core/test/log/logger.cpp | 4 ++-- core/test/matrix/identity.cpp | 4 ++-- cuda/test/base/scoped_device_id.cu | 4 ++-- dpcpp/base/device_matrix_data_kernels.dp.cpp | 4 ++-- dpcpp/base/onedpl.hpp | 6 ++---- dpcpp/distributed/partition_helpers_kernels.dp.cpp | 5 +++-- dpcpp/distributed/partition_kernels.dp.cpp | 5 +++-- dpcpp/multigrid/pgm_kernels.dp.cpp | 5 +++-- hip/factorization/par_ilut_select_common.hip.cpp | 5 +++-- hip/test/base/hip_executor.hip.cpp | 5 +++-- hip/test/base/hip_executor_topology.hip.cpp | 5 +++-- hip/test/base/math.hip.cpp | 5 +++-- hip/test/base/scoped_device_id.hip.cpp | 5 +++-- hip/test/components/cooperative_groups.hip.cpp | 5 +++-- hip/test/components/merging.hip.cpp | 5 +++-- hip/test/components/searching.hip.cpp | 5 +++-- 16 files changed, 43 insertions(+), 34 deletions(-) diff --git a/core/test/log/logger.cpp b/core/test/log/logger.cpp index 90330dbd1d0..18315442559 100644 --- a/core/test/log/logger.cpp +++ b/core/test/log/logger.cpp @@ -2,10 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on +// clang-format off #include GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS -// force-top: off +// clang-format on #include diff --git a/core/test/matrix/identity.cpp b/core/test/matrix/identity.cpp index 28e0b0682e1..69370df07c5 100644 --- a/core/test/matrix/identity.cpp +++ b/core/test/matrix/identity.cpp @@ -2,9 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on +// clang-format off #include -// force-top: off +// clang-format on #include diff --git 
a/cuda/test/base/scoped_device_id.cu b/cuda/test/base/scoped_device_id.cu index 2b2eb58db49..4abd8f5810b 100644 --- a/cuda/test/base/scoped_device_id.cu +++ b/cuda/test/base/scoped_device_id.cu @@ -2,10 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on +// clang-format off // prevent compilation failure related to disappearing assert(...) statements #include -// force-top: off +// clang-format on #include diff --git a/dpcpp/base/device_matrix_data_kernels.dp.cpp b/dpcpp/base/device_matrix_data_kernels.dp.cpp index 9779ba576fd..5869c853385 100644 --- a/dpcpp/base/device_matrix_data_kernels.dp.cpp +++ b/dpcpp/base/device_matrix_data_kernels.dp.cpp @@ -2,10 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on +// clang-format off // oneDPL needs to be first to avoid issues with libstdc++ TBB impl #include -// force-top: off +// clang-format on #include "core/base/device_matrix_data_kernels.hpp" diff --git a/dpcpp/base/onedpl.hpp b/dpcpp/base/onedpl.hpp index 2f2f8ec3ab1..9dd5ba18976 100644 --- a/dpcpp/base/onedpl.hpp +++ b/dpcpp/base/onedpl.hpp @@ -5,11 +5,9 @@ #ifndef GKO_DPCPP_BASE_ONEDPL_HPP_ #define GKO_DPCPP_BASE_ONEDPL_HPP_ - -// force-top: on +// clang-format off #include -// force-top: off - +// clang-format on #include diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp index 80eb073beee..8f85374c1d0 100644 --- a/dpcpp/distributed/partition_helpers_kernels.dp.cpp +++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp @@ -2,11 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on + +// clang-format off #include #include #include -// force-top: off +// clang-format on #include "core/distributed/partition_helpers_kernels.hpp" diff --git a/dpcpp/distributed/partition_kernels.dp.cpp b/dpcpp/distributed/partition_kernels.dp.cpp index 04bc0ee7cdc..3d2c403e35d 100644 --- a/dpcpp/distributed/partition_kernels.dp.cpp +++ 
b/dpcpp/distributed/partition_kernels.dp.cpp @@ -2,10 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on + +// clang-format off #include #include -// force-top: off +// clang-format on #include "core/distributed/partition_kernels.hpp" diff --git a/dpcpp/multigrid/pgm_kernels.dp.cpp b/dpcpp/multigrid/pgm_kernels.dp.cpp index 644755e363c..d25d44ed8e9 100644 --- a/dpcpp/multigrid/pgm_kernels.dp.cpp +++ b/dpcpp/multigrid/pgm_kernels.dp.cpp @@ -2,10 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on + +// clang-format off // oneDPL needs to be first to avoid issues with libstdc++ TBB impl #include -// force-top: off +// clang-format on #include "core/multigrid/pgm_kernels.hpp" diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp index ddad307dc62..5486b3f5ba5 100644 --- a/hip/factorization/par_ilut_select_common.hip.cpp +++ b/hip/factorization/par_ilut_select_common.hip.cpp @@ -2,10 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on + +// clang-format off // prevent compilation failure related to disappearing assert(...) statements #include "common/cuda_hip/base/runtime.hpp" -// force-top: off +// clang-format on #include "hip/factorization/par_ilut_select_common.hip.hpp" diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp index cfdfc3122fd..266532823e7 100644 --- a/hip/test/base/hip_executor.hip.cpp +++ b/hip/test/base/hip_executor.hip.cpp @@ -2,10 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on + +// clang-format off // prevent compilation failure related to disappearing assert(...)
statements #include -// force-top: off +// clang-format on #include diff --git a/hip/test/base/hip_executor_topology.hip.cpp b/hip/test/base/hip_executor_topology.hip.cpp index 7a94ae6ded2..10ebac1bbc6 100644 --- a/hip/test/base/hip_executor_topology.hip.cpp +++ b/hip/test/base/hip_executor_topology.hip.cpp @@ -2,10 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on + +// clang-format off // prevent compilation failure related to disappearing assert(...) statements #include -// force-top: off +// clang-format on #include diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp index 8462cbe5716..f018c634a6a 100644 --- a/hip/test/base/math.hip.cpp +++ b/hip/test/base/math.hip.cpp @@ -2,10 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on + +// clang-format off // prevent compilation failure related to disappearing assert(...) statements #include -// force-top: off +// clang-format on #include diff --git a/hip/test/base/scoped_device_id.hip.cpp b/hip/test/base/scoped_device_id.hip.cpp index 991baa80e3a..78d51fc989d 100644 --- a/hip/test/base/scoped_device_id.hip.cpp +++ b/hip/test/base/scoped_device_id.hip.cpp @@ -2,10 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on + +// clang-format off // prevent compilation failure related to disappearing assert(...) 
statements #include -// force-top: off +// clang-format on #include diff --git a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp index 53f4b9a72a0..f99b4eb8a87 100644 --- a/hip/test/components/cooperative_groups.hip.cpp +++ b/hip/test/components/cooperative_groups.hip.cpp @@ -2,10 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on + +// clang-format off // TODO remove when the HIP includes are fixed #include -// force-top: off +// clang-format on #include diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index b8ee2f03d29..be18447a901 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -2,10 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on + +// clang-format off // TODO remove when the HIP includes are fixed #include -// force-top: off +// clang-format on #include "hip/components/merging.hip.hpp" diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp index 2662d367f4d..252e8841893 100644 --- a/hip/test/components/searching.hip.cpp +++ b/hip/test/components/searching.hip.cpp @@ -2,10 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -// force-top: on + +// clang-format off // TODO remove when the HIP includes are fixed #include -// force-top: off +// clang-format on #include "hip/components/searching.hip.hpp" From 3a2d1012950fe0e8233e86df506b6909cf322e82 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 16 Nov 2023 16:37:09 +0000 Subject: [PATCH 018/448] remove format_header.sh --- .github/bot-pr-base.sh | 2 - .github/bot-pr-format-base.sh | 5 +- .pre-commit-config.yaml | 13 - CMakeLists.txt | 20 -- dev_tools/scripts/config | 100 -------- dev_tools/scripts/format_header.sh | 393 ----------------------------- dev_tools/scripts/regroup | 12 - 7 files changed, 1 insertion(+), 544 deletions(-) delete mode 100644 dev_tools/scripts/config delete mode 100755 
dev_tools/scripts/format_header.sh delete mode 100644 dev_tools/scripts/regroup diff --git a/.github/bot-pr-base.sh b/.github/bot-pr-base.sh index 697ecc7c848..61a86290db9 100644 --- a/.github/bot-pr-base.sh +++ b/.github/bot-pr-base.sh @@ -3,8 +3,6 @@ source .github/bot-base.sh EXTENSION_REGEX='\.(cuh?|hpp|hpp\.inc?|cpp)$' -FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp|common/unified|test)/' -FORMAT_REGEX='^(common|examples)/' CLANG_FORMAT=clang-format-14 echo -n "Collecting information on triggering PR" diff --git a/.github/bot-pr-format-base.sh b/.github/bot-pr-format-base.sh index 7c08dd605a1..8667f5b9473 100644 --- a/.github/bot-pr-format-base.sh +++ b/.github/bot-pr-format-base.sh @@ -14,9 +14,7 @@ git config user.name "ginkgo-bot" # save scripts from develop cp .clang-format .pre-commit-config.yaml /tmp -pushd dev_tools/scripts || exit 1 -cp format_header.sh update_ginkgo_header.sh /tmp -popd || exit 1 +cp dev_tools/scripts/update_ginkgo_header.sh /tmp # checkout current PR head LOCAL_BRANCH=format-tmp-$HEAD_BRANCH @@ -25,7 +23,6 @@ git checkout -b $LOCAL_BRANCH fork/$HEAD_BRANCH # restore files from develop cp /tmp/.clang-format . cp /tmp/.pre-commit-config.yaml . 
-cp /tmp/format_header.sh dev_tools/scripts/ cp /tmp/update_ginkgo_header.sh dev_tools/scripts/ # make base pre-commit config available diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 236f2bdea7b..9814e8fd810 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,19 +25,6 @@ repos: examples/external-lib-interfacing/external-lib-interfacing.cpp| core/base/workspace_aliases.hpp )$ - - id: format-headers - name: format headers - entry: env CLANG_FORMAT=dev_tools/scripts/clang-format.sh dev_tools/scripts/format_header.sh - require_serial: true - language: system - types_or: [c, c++, cuda] - exclude: | - (?x)^( - third_party/SuiteSparse/AMD/.*| - third_party/identify_stream_usage/.*| - include/ginkgo/ginkgo.hpp| - core/base/workspace_aliases.hpp - )$ - id: update-ginkgo-header name: update ginkgo header entry: dev_tools/scripts/update_ginkgo_header.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index d6ab1dbd936..21832c98592 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -367,26 +367,6 @@ if(NOT "${BASH}" STREQUAL "BASH-NOTFOUND" AND GINKGO_DEVEL_TOOLS) add_custom_target(generate_ginkgo_header ALL COMMAND ${Ginkgo_SOURCE_DIR}/dev_tools/scripts/update_ginkgo_header.sh WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR}) - find_program(GIT git) - if(NOT "${GIT}" STREQUAL "GIT-NOTFOUND") - add_custom_target(format_header - COMMAND echo "format header on the modified code files except build, examples, third_party, accessor/, dev_tools, ginkgo.hpp" - COMMAND bash -c "git diff --name-only origin/master...HEAD | \ - grep -Ev 'build|examples|third_party|accessor/|dev_tools|ginkgo.hpp' | \ - grep -E '(\.hip)?\.(cu|hpp|cuh|cpp)$' | \ - xargs -r -n1 ${Ginkgo_SOURCE_DIR}/dev_tools/scripts/format_header.sh" - WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR} - VERBATIM) - endif() - unset(GIT CACHE) - add_custom_target(format_header_all - COMMAND echo "format header on all code files except build, examples, third_party, accessor/, dev_tools, ginkgo.hpp" - 
COMMAND bash -c "find * -type f | \ - grep -Ev 'build|examples|third_party|accessor/|dev_tools|ginkgo.hpp' | \ - grep -E '(\.hip)?\.(cu|hpp|cuh|cpp)$' | \ - xargs -r -n1 ${Ginkgo_SOURCE_DIR}/dev_tools/scripts/format_header.sh" - WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR} - VERBATIM) endif() unset(BASH CACHE) diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config deleted file mode 100644 index 79e6a227530..00000000000 --- a/dev_tools/scripts/config +++ /dev/null @@ -1,100 +0,0 @@ -- "test_install|benchmark" - - FixInclude: "ginkgo/ginkgo.hpp" -- "executor" - - FixInclude: "ginkgo/core/base/executor.hpp" -- "hip/base/config.hip.hpp" - - FixInclude: "hip/hip_runtime.h" -- "hip/matrix/fft_kernels_stub" - - FixInclude: "core/matrix/fft_kernels.hpp" -- "(cuda|hip|omp|dpcpp)/test/factorization/par_ilu_kernels" - - FixInclude: "core/factorization/par_ilu_kernels.hpp" -- "(cuda|hip|omp|dpcpp)/test/factorization/par_ilut_kernels" - - FixInclude: "core/factorization/par_ilut_kernels.hpp" -- "(cuda|hip|omp|dpcpp)/test/factorization/par_ict_kernels" - - FixInclude: "core/factorization/par_ict_kernels.hpp" -- "(cuda|hip|omp|dpcpp)/test/factorization/par_ic_kernels" - - FixInclude: "core/factorization/par_ic_kernels.hpp" -- "cuda/factorization/par_ilut_select_common" - - FixInclude: "cuda/factorization/par_ilut_select_common.cuh" -- "hip/factorization/par_ilut_select_common" - - FixInclude: "hip/factorization/par_ilut_select_common.hip.hpp" -- "(cuda|hip|dpcpp)/factorization/par_ilut_" - - FixInclude: "core/factorization/par_ilut_kernels.hpp" -- "(cuda|hip|dpcpp)/factorization/par_ict_" - - FixInclude: "core/factorization/par_ict_kernels.hpp" -- "(cuda|hip|dpcpp)/preconditioner/jacobi_" - - FixInclude: "core/preconditioner/jacobi_kernels.hpp" -- "(cuda|hip|dpcpp|omp)/base/kernel_launch\." - - FixInclude: "common/unified/base/kernel_launch.hpp" -- "(cuda|hip|dpcpp|omp)/test/base/kernel_launch\." 
- - FixInclude: "common/unified/base/kernel_launch.hpp" -- "(cuda|hip|dpcpp|omp)/base/kernel_launch_solver\." - - FixInclude: "common/unified/base/kernel_launch_solver.hpp" -- "(cuda|hip|dpcpp|omp)/base/kernel_launch_solver\." - - FixInclude: "common/unified/base/kernel_launch_solver.hpp" -- "test/base/kernel_launch_generic.cpp" - - FixInclude: "common/unified/base/kernel_launch.hpp" -- "^test/solver/(lower|upper)_trs_kernels.cpp" - - CoreSuffix: "_kernels" - - PathPrefix: "ginkgo/core" - - PathIgnore: "0" - - RemoveTest: "true" -- "^test/matrix/csr_kernels2.cpp" - - CoreSuffix: "_kernels2" - - PathPrefix: "ginkgo/core" - - PathIgnore: "0" - - RemoveTest: "true" -- "elimination_forest\.cpp" - - FixInclude: "core/factorization/elimination_forest.hpp" -- "symbolic\.cpp" - - FixInclude: "core/factorization/symbolic.hpp" -- "common/unified/.*.cpp" - - PathIgnore: "2" - - PathPrefix: "core" - - CoreSuffix: "\.template" -- "core/test/base/(extended_float|iterator_factory)" - - RemoveTest: "true" -- "core/test/base/allocator" - - FixInclude: "core/base/allocator.hpp" -- "core/test/utils/matrix_utils_test" - - FixInclude: "core/utils/matrix_utils.hpp" -- "reference/test/base/utils" - - FixInclude: "core/base/utils.hpp" -- "_builder\.cpp" - - RemoveTest: "true" -- "_builder\.hpp" - - CoreSuffix: "_builder" -- "dpcpp/test/base/dim3\.dp\.cpp" - - FixInclude: "dpcpp/base/dim3.dp.hpp" -- "test/base/kernel_launch" - - RemoveTest: "true" - - PathIgnore: "1" - - PathPrefix: "(cuda|hip|omp|dpcpp)" -- "(cuda|hip|omp|dpcpp|reference)/base/.*_kernels" - - RemoveTest: "true" - - PathIgnore: "1" - - PathPrefix: "core" -- "/components/.*_kernels" - - RemoveTest: "true" - - PathIgnore: "1" - - PathPrefix: "core" -- "/components/" - - RemoveTest: "true" -- "test/utils" - - CoreSuffix: "_test" - - PathIgnore: "1" - - PathPrefix: "core" -- "core\/.*" - - CoreSuffix: "_kernels" - - PathPrefix: "ginkgo" - - PathIgnore: "0" - - RemoveTest: "true" -- "/(test|base)/" - - CoreSuffix: "_kernels" - 
- PathPrefix: "ginkgo/core" - - PathIgnore: "1" - - RemoveTest: "true" -- ".*" - - PathPrefix: "core" - - PathIgnore: "1" - - CoreSuffix: "\.template" diff --git a/dev_tools/scripts/format_header.sh b/dev_tools/scripts/format_header.sh deleted file mode 100755 index e7d51080b86..00000000000 --- a/dev_tools/scripts/format_header.sh +++ /dev/null @@ -1,393 +0,0 @@ -#!/usr/bin/env bash - -CLANG_FORMAT=${CLANG_FORMAT:="clang-format"} - -convert_header () { - local regex="^(#include )(<|\")(.*)(\"|>)$" - local jacobi_regex="^(cuda|hip|dpcpp)\/preconditioner\/jacobi_common(\.hip)?\.hpp" - if [[ $@ =~ ${regex} ]]; then - header_file="${BASH_REMATCH[3]}" - if [ -f "${header_file}" ]; then - if [[ "${header_file}" =~ ^ginkgo ]]; then - echo "#include <${header_file}>" - else - echo "#include \"${header_file}\"" - fi - elif [ "${header_file}" = "matrices/config.hpp" ]; then - echo "#include \"${header_file}\"" - elif [ "${header_file}" = "extensions/test/config/file_location.hpp" ]; then - echo "#include \"${header_file}\"" - elif [[ "${header_file}" =~ ${jacobi_regex} ]]; then - echo "#include \"${header_file}\"" - else - echo "#include <${header_file}>" - fi - else - echo "$@" - fi -} - -get_header_def () { - local regex="\.(hpp|cuh)" - if [[ $@ =~ $regex ]]; then - local def=$(echo "$@" | sed -E "s~include/ginkgo/~PUBLIC_~g;s~/|\.~_~g") - # Used to get rid of \r in Windows - def=$(echo "GKO_${def^^}_") - echo "$def" - else - echo "" - fi -} - -add_regroup () { - cp .clang-format .clang-format.temp - sed -i "s~\.\.\.~~g" .clang-format - cat dev_tools/scripts/regroup >> .clang-format - echo "..." 
>> .clang-format -} - -remove_regroup () { - mv .clang-format.temp .clang-format -} - -# It reads "dev_tools/scripts/config" to generate the corresponding main header -# The setting setting: -# - "file_regex" -# - CoreSuffix: "core_suffix_regex" (default "") -# - PathPrefix: "path_prefix_regex" (default "") -# - PathIgnore: "path_ignore_number" (default "0") -# - RemoveTest: "false/true" (default "false") -# - FixInclude: "the specific main header" (default "") -# Only "file_regex" without any setting is fine, and it means find the same name with header suffix -# For example, /path/to/file.cpp will change to /path/to/file.hpp -# file_regex : selecting which file apply this rule -# CoreSuffix : remove the pattern which passes the "core_suffix_regex" of file -# PathPrefix : adds "path_prefix_regex" before path, and the position depends on PathIgnore -# PathIgnore : ignore the number "path_ignore_number" folder from top level, and then add "path_prefix_regex" into path -# RemoveTest : Decide whether ignore /test/ in the path -# FixInclude : Specify the main header. 
If it is set, ignore others setting -# Note: This script picks the first fitting "file_regex" rules according the ordering in config -get_include_regex () { - local file="$1" - declare -n local_output=$2 - local core_suffix="" - local path_prefix="" - local path_ignore="0" - local fix_include="" - local remove_test="false" - local item_regex="^-\ +\"(.*)\"" - local path_prefix_regex="PathPrefix:\ +\"(.*)\"" - local core_suffix_regex="CoreSuffix:\ +\"(.*)\"" - local path_ignore_regex="PathIgnore:\ +\"(.*)\"" - local fix_include_regex="FixInclude:\ +\"(.*)\"" - local remove_test_regex="RemoveTest:\ +\"(.*)\"" - local match="false" - while IFS='' read -r line; do - if [[ "$line" =~ $item_regex ]]; then - file_regex="${BASH_REMATCH[1]}" - if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then - echo "DEBUG: Checking pattern $line" - fi - if [[ "$match" = "true" ]]; then - break - elif [[ $file =~ $file_regex ]]; then - if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then - echo "DEBUG: Matching pattern $line for $file" - fi - match="true" - fi - elif [ "$match" = "true" ]; then - if [[ "$line" =~ $path_prefix_regex ]]; then - path_prefix="${BASH_REMATCH[1]}" - if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then - echo "DEBUG: Path prefix set to $path_prefix" - fi - elif [[ "$line" =~ $core_suffix_regex ]]; then - core_suffix="${BASH_REMATCH[1]}" - if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then - echo "DEBUG: Core suffix set to $core_suffix" - fi - elif [[ "$line" =~ $path_ignore_regex ]]; then - path_ignore="${BASH_REMATCH[1]}" - if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then - echo "DEBUG: Ignoring $path_ignore top-level dirs" - fi - elif [[ "$line" =~ $fix_include_regex ]]; then - fix_include="${BASH_REMATCH[1]}" - if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then - echo "DEBUG: Fixed include $fix_include" - fi - elif [[ "$line" =~ $remove_test_regex ]]; then - remove_test="${BASH_REMATCH[1]}" - if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then - echo "DEBUG: Remove test $remove_test" - fi - else - echo "Ignore unknow 
setting: \"${file_regex}\" - ${line}" - fi - fi - done < "dev_tools/scripts/config" - local_output="" - if [ -z "${fix_include}" ]; then - local path_regex="([a-zA-Z_]*\/){${path_ignore}}(.*)\.(cpp|hpp|cu|cuh)" - if [ ! -z "${path_prefix}" ]; then - path_prefix="${path_prefix}/" - fi - if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then - echo "DEBUG: Handling $file" - fi - local_output=$(echo "${file}" | sed -E "s~\.(hip|dp)~~g;s~$path_regex~$path_prefix\2~g") - if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then - echo "DEBUG: After removing path_ignore and path_prefix: $local_output" - fi - local_output=$(echo "${local_output}" | sed -E "s~$core_suffix$~~g") - if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then - echo "DEBUG: After removing core_suffix: $local_output" - fi - local_output="#include (<|\")$local_output\.(hpp|hip\.hpp|dp\.hpp|cuh)(\"|>)" - if [ "${remove_test}" = "true" ]; then - local_output=$(echo "${local_output}" | sed -E "s~test/~~g") - if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then - echo "DEBUG: After removing test: ${local_output}" - fi - fi - else - if [ "$FORMAT_HEADER_DEBUG" = "1" ]; then - echo "DEBUG: Fixing include $fix_include to the top" - fi - local_output="#include (<|\")$fix_include(\"|>)" - fi -} - -# Test if required commands are present on the system: -if ! command -v "$CLANG_FORMAT" &> /dev/null; then - echo "The command 'clang-format' is required for this script to work, but not supported by your system. It can be set via environment parameter CLANG_FORMAT=" 1>&2 - exit 1 -fi - -# Test the command on MacOS -if ! declare -n &> /dev/null; then - echo "The command 'declare' needs to support the '-n' option. Please update bash or use 'brew install bash' if on MacOS" 1>&2 - exit 1 -fi - -touch .dummy_file -if ! sed -i 's///g' .dummy_file &> /dev/null; then - echo "The command 'sed' needs to support the '-i' option without suffix. Please use gnu sed or use 'brew install gnu-sed' if on MacOS" 1>&2 - rm .dummy_file - exit 1 -fi - -if ! 
head -n -1 .dummy_file &> /dev/null; then - echo "The command 'head' needs to support '-NUM' option, Please use gnu head or use 'brew install coreutils' if on MacOS" 1>&2 - rm .dummy_file - exit 1 -fi -rm .dummy_file - -for current_file in $@; do - if [ -z "${current_file}" ]; then - echo "Usage: $0 path/to/fileA path/to/fileB ..." - exit 1 - fi - - if [ ! -f "${current_file}" ]; then - echo "${current_file} does not exist or it is not a file." - exit 1 - fi - - GINKGO_LICENSE_BEGIN="// SPDX-FileCopyrightText:" - GINKGO_LICENSE_END="// SPDX-License-Identifier:" - - CONTENT="content.cpp" # Store the residual part (start from namespace) - BEFORE="before.cpp" # Store the main header and the #ifdef/#define of header file - HAS_HIP_RUNTIME="false" - DURING_LICENSE="false" - INCLUDE_REGEX="^#include.*" - INCLUDE_INC="\.inc" - MAIN_PART_MATCH="" - - # FORCE_TOP_ON/OFF is only valid before other #include - FORCE_TOP_ON="// force-top: on" - FORCE_TOP_OFF="// force-top: off" - FORCE_TOP="force_top" - DURING_FORCE_TOP="false" - - get_include_regex "${current_file}" MAIN_PART_MATCH - HEADER_DEF=$(get_header_def "${current_file}") - - IFNDEF="" - DEFINE="" - IFNDEF_REGEX="^#ifndef GKO_" - DEFINE_REGEX="^#define GKO_" - HEADER_REGEX="\.(hpp|cuh)" - SKIP="true" - START_BLOCK_REX="^(#if| *\/\*)" - END_BLOCK_REX="^#endif|\*\/$" - ENDIF_REX="^#endif" - IN_BLOCK=0 - KEEP_LINES=0 - LAST_NONEMPTY="" - ALARM="" - COMMENT_REGEX="^ *\/\/" - CONSIDER_REGEX="${START_BLOCK_REX}|${END_BLOCK_REX}|${COMMENT_REGEX}|${INCLUDE_REGEX}" - - # This part capture the main header and give the possible fail arrangement information - while IFS='' read -r line || [ -n "$line" ]; do - if [[ "${line}" =~ ${GINKGO_LICENSE_BEGIN} ]] || [ "${DURING_LICENSE}" = "true" ]; then - DURING_LICENSE="true" - if [[ "${line}" =~ ${GINKGO_LICENSE_END} ]]; then - DURING_LICENSE="false" - SKIP="true" - fi - elif [ "${SKIP}" = "true" ] && ([ "$line" = "${FORCE_TOP_ON}" ] || [ "${DURING_FORCE_TOP}" = "true" ]); then - 
DURING_FORCE_TOP="true" - if [ "$line" = "${FORCE_TOP_OFF}" ]; then - DURING_FORCE_TOP="false" - fi - if [[ "${line}" =~ $INCLUDE_REGEX ]]; then - line="$(convert_header "${line}")" - fi - echo "$line" >> "${FORCE_TOP}" - elif [ -z "${line}" ] && [ "${SKIP}" = "true" ]; then - # Ignore all empty lines between LICENSE and Header - : - else - if [[ "${line}" =~ $INCLUDE_REGEX ]]; then - line="$(convert_header "${line}")" - fi - if [ -z "${line}" ]; then - KEEP_LINES=$((KEEP_LINES+1)) - else - LAST_NONEMPTY="${line}" - KEEP_LINES=0 - fi - if [[ "${current_file}" =~ ${HEADER_REGEX} ]] && [[ "${line}" =~ ${IFNDEF_REGEX} ]] && [ "${SKIP}" = "true" ] && [ -z "${DEFINE}" ]; then - IFNDEF="${line}" - elif [[ "${current_file}" =~ ${HEADER_REGEX} ]] && [[ "${line}" =~ ${DEFINE_REGEX} ]] && [ "${SKIP}" = "true" ] && [ -n "${IFNDEF}" ]; then - DEFINE="${line}" - elif [ -z "${MAIN_PART_MATCH}" ] || [[ ! "${line}" =~ ${MAIN_PART_MATCH} ]] || [[ "${IN_BLOCK}" -gt 0 ]]; then - echo "${line}" >> "${CONTENT}" - SKIP="false" - if [[ "${line}" =~ $START_BLOCK_REX ]]; then - # keep everything in #if block and /* block - IN_BLOCK=$((IN_BLOCK+1)) - if [ -z "${ALARM}" ]; then - ALARM="set" - fi - fi - if [[ "${IN_BLOCK}" = "0" ]] && [ -n "${line}" ] && [[ ! 
"${line}" =~ ${CONSIDER_REGEX} ]]; then - if [ "${ALARM}" = "set" ]; then - ALARM="true" - elif [ -z "${ALARM}" ]; then - ALARM="false" - fi - fi - if [[ "${line}" =~ $END_BLOCK_REX ]]; then - IN_BLOCK=$((IN_BLOCK-1)) - fi - else - echo "${line}" >> ${BEFORE} - fi - fi - done < "${current_file}" - if [ "${ALARM}" = "true" ]; then - echo "Warning ${current_file}: sorting is probably incorrect" - fi - - # Write license - CURRENT_YEAR=$(date +%Y) - echo "${GINKGO_LICENSE_BEGIN} 2017 - ${CURRENT_YEAR} The Ginkgo authors" > "${current_file}" - echo "//" >> "${current_file}" - echo "${GINKGO_LICENSE_END} BSD-3-Clause" >> "${current_file}" - echo "" >> "${current_file}" - - # Write the definition of header according to path - if [ -n "${IFNDEF}" ] && [ -n "${DEFINE}" ]; then - IFNDEF="#ifndef ${HEADER_DEF}" - DEFINE="#define ${HEADER_DEF}" - elif [ -z "${IFNDEF}" ] && [ -z "${DEFINE}" ]; then - : - else - echo "Warning ${current_file}: only #ifndef GKO_ or #define GKO_ is in the header" - fi - if [ -n "${IFNDEF}" ]; then - echo "${IFNDEF}" >> "${current_file}" - fi - if [ -n "${DEFINE}" ]; then - echo "${DEFINE}" >> "${current_file}" - echo "" >> "${current_file}" - echo "" >> "${current_file}" - fi - - # Write the force-top header - if [ -f "${FORCE_TOP}" ]; then - cat "${FORCE_TOP}" >> "${current_file}" - echo "" >> "${current_file}" - echo "" >> "${current_file}" - rm "${FORCE_TOP}" - fi - - # Write the main header and give warnning if there are multiple matches - if [ -f "${BEFORE}" ]; then - # sort or remove the duplication - "${CLANG_FORMAT}" -i -style=file ${BEFORE} - if [ "$(wc -l < ${BEFORE})" -gt "1" ]; then - echo "Warning ${current_file}: there are multiple main header matchings" - fi - cat ${BEFORE} >> "${current_file}" - if [ -f "${CONTENT}" ]; then - echo "" >> "${current_file}" - echo "" >> "${current_file}" - fi - rm "${BEFORE}" - fi - - # Arrange the remain files and give - if [ -f "${CONTENT}" ]; then - add_regroup - head -n -${KEEP_LINES} ${CONTENT} >> 
temp - if [ -n "${IFNDEF}" ] && [ -n "${DEFINE}" ]; then - # Ignore the last line #endif - if [[ "${LAST_NONEMPTY}" =~ $ENDIF_REX ]]; then - head -n -1 temp > ${CONTENT} - echo "#endif // $HEADER_DEF" >> ${CONTENT} - else - echo "Warning ${current_file}: Found the begin header_def but did not find the end of header_def" - cat temp > ${CONTENT} - fi - else - cat temp > "${CONTENT}" - fi - "${CLANG_FORMAT}" -i -style=file "${CONTENT}" - rm temp - remove_regroup - PREV_INC=0 - IN_IF="false" - SKIP="true" - while IFS='' read -r line; do - # Skip the empty line in the beginning - if [ "${SKIP}" = "true" ] && [[ -z "${line}" ]]; then - continue - else - SKIP="false" - fi - # Insert content with correct number empty lines - if [[ ${line} =~ ${INCLUDE_REGEX} ]] && [[ ! ${line} =~ ${INCLUDE_INC} ]]; then - if [[ ${PREV_INC} == 1 ]]; then - echo "" >> "${current_file}" - fi - PREV_INC=0 - else - if [ -z "${line}" ]; then - PREV_INC=$((PREV_INC+1)) - else - # To keep the original lines - PREV_INC=-3 - fi - fi - echo "${line}" >> "${current_file}" - done < "${CONTENT}" - rm "${CONTENT}" - fi -done diff --git a/dev_tools/scripts/regroup b/dev_tools/scripts/regroup deleted file mode 100644 index b10570f4982..00000000000 --- a/dev_tools/scripts/regroup +++ /dev/null @@ -1,12 +0,0 @@ -IncludeBlocks: Regroup -IncludeCategories: - - Regex: '^<(nlohmann|gflags|gtest|papi).*' - Priority: 3 - - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi|mpi|nvToolsExt|Kokkos_Core).*' - Priority: 2 - - Regex: '^ Date: Fri, 17 Nov 2023 11:30:09 +0000 Subject: [PATCH 019/448] remove formatting from convert_source.sh --- dev_tools/oneapi/convert_source.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/dev_tools/oneapi/convert_source.sh b/dev_tools/oneapi/convert_source.sh index 7aaecf2e78d..090c31e8ccb 100755 --- a/dev_tools/oneapi/convert_source.sh +++ b/dev_tools/oneapi/convert_source.sh @@ -17,7 +17,6 @@ # CMake's step is not required if copying the ginkgo config.hpp 
from another ginkgo build into "${ROOT_DIR}/include/ginkgo/". # ROOT_BUILD_DIR: the complete path for build folder. The default is "${ROOT_DIR}/${BUILD_DIR}" # GTEST_HEADER_DIR: the gtest header folder. The default is "${ROOT_BUILD_DIR}/_deps/googletest-src/googletest/include" -# CLANG_FORMAT: the clang-format exec. The default is "clang-format" # VERBOSE: if it is set as 1, script will output the path information CURRENT_DIR="$( pwd )" cd "$( dirname "${BASH_SOURCE[0]}" )" @@ -30,7 +29,6 @@ BUILD_DIR="${BUILD_DIR:="build"}" ROOT_BUILD_DIR="${ROOT_BUILD_DIR:="${ROOT_DIR}/${BUILD_DIR}"}" CUDA_HEADER_DIR="${CUDA_HEADER_DIR}" GTEST_HEADER_DIR="${GTEST_HEADER_DIR:="${ROOT_BUILD_DIR}/_deps/googletest-src/googletest/include"}" -CLANG_FORMAT=${CLANG_FORMAT:="clang-format"} if [[ "${VERBOSE}" == 1 ]]; then echo "#####################" echo "# Environment Setting:" @@ -40,7 +38,6 @@ if [[ "${VERBOSE}" == 1 ]]; then echo "ROOT_BUILD_DIR ${ROOT_BUILD_DIR}" echo "GTEST_HEADER_DIR ${GTEST_HEADER_DIR}" echo "CUDA_HEADER_DIR ${CUDA_HEADER_DIR}" - echo "CLANG_FORMAT ${CLANG_FORMAT}" echo "#####################" fi if [[ "${CUDA_HEADER_DIR}" == "" ]]; then @@ -166,9 +163,8 @@ if [[ "${VERBOSE}" == 1 ]]; then fi rm "${OUTPUT_FILE}" echo "#define GET_QUEUE 0" >> "${OUTPUT_FILE}" -# add empty ginkgo license such that format_header recognize some header before header def macro CURRENT_YEAR=$(date +%Y) -echo "${GINKGO_LICENSE_BEGIN} 2017-${CURRENT_YEAR} The Ginkgo authors" >> "${OUTPUT_FILE}" +echo "${GINKGO_LICENSE_BEGIN} ${CURRENT_YEAR} The Ginkgo authors" >> "${OUTPUT_FILE}" echo "//" >> "${OUTPUT_FILE}" echo "${GINKGO_LICENSE_END} BSD-3-Clause" >> "${OUTPUT_FILE}" rm "${GLOBAL_FILE}" @@ -191,9 +187,6 @@ while IFS='' read -r line; do fi done < "${UNFORMAT_FILE}" -# Call clang-format for better formatting. -${CLANG_FORMAT} -style=file "${EMBED_FILE}" > "${FORMAT_FILE}" - # Add an extra host function so that the converted DPC++ code will look like CUDA. 
"${SCRIPT_DIR}/add_host_function.sh" "${FORMAT_FILE}" > "${EMBED_HOST_FILE}" From a1ec48e86b1533fbcbea8b98329566aa31f175e2 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 14 Jun 2024 08:51:57 +0000 Subject: [PATCH 020/448] use clang-format to format includes Co-authored-by: Yu-Hsiang M. Tsai --- .clang-format | 25 ++++++++++++++++++++++++- .pre-commit-config.yaml | 3 ++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/.clang-format b/.clang-format index 3e1d9335acf..acd1e4321c2 100644 --- a/.clang-format +++ b/.clang-format @@ -58,7 +58,30 @@ ForEachMacros: - foreach - Q_FOREACH - BOOST_FOREACH -IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^' # standard library + Priority: 1 + - Regex: '(^<(hip/hip_runtime|cuda(_runtime)?)\.h)|common/cuda_hip/base/runtime\.hpp$' + Priority: 2 + SortPriority: 2 + - Regex: '^<(omp|cu|hip|oneapi|thrust|CL/|cooperative|mpi|nvToolsExt).*' + Priority: 2 + SortPriority: 3 + - Regex: '^<(nlohmann|gflags|gtest|sde_lib|papi).*' + Priority: 4 + - Regex: '' + Priority: 6 + - Regex: '^ Date: Fri, 28 Jun 2024 09:34:01 +0200 Subject: [PATCH 021/448] remove force-top where made unnecessary --- cuda/test/base/scoped_device_id.cu | 3 --- dpcpp/base/device_matrix_data_kernels.dp.cpp | 4 +--- dpcpp/base/onedpl.hpp | 4 ++-- dpcpp/distributed/partition_helpers_kernels.dp.cpp | 2 -- dpcpp/distributed/partition_kernels.dp.cpp | 2 -- dpcpp/multigrid/pgm_kernels.dp.cpp | 3 --- 6 files changed, 3 insertions(+), 15 deletions(-) diff --git a/cuda/test/base/scoped_device_id.cu b/cuda/test/base/scoped_device_id.cu index 4abd8f5810b..5c2e496b64b 100644 --- a/cuda/test/base/scoped_device_id.cu +++ b/cuda/test/base/scoped_device_id.cu @@ -2,10 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -// clang-format off -// prevent compilation failure related to disappearing assert(...) 
statements #include -// clang-format on #include diff --git a/dpcpp/base/device_matrix_data_kernels.dp.cpp b/dpcpp/base/device_matrix_data_kernels.dp.cpp index 5869c853385..a735470d5ba 100644 --- a/dpcpp/base/device_matrix_data_kernels.dp.cpp +++ b/dpcpp/base/device_matrix_data_kernels.dp.cpp @@ -2,10 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -// clang-format off -// oneDPL needs to be first to avoid issues with libstdc++ TBB impl + #include -// clang-format on #include "core/base/device_matrix_data_kernels.hpp" diff --git a/dpcpp/base/onedpl.hpp b/dpcpp/base/onedpl.hpp index 9dd5ba18976..8ea971f4602 100644 --- a/dpcpp/base/onedpl.hpp +++ b/dpcpp/base/onedpl.hpp @@ -5,9 +5,9 @@ #ifndef GKO_DPCPP_BASE_ONEDPL_HPP_ #define GKO_DPCPP_BASE_ONEDPL_HPP_ -// clang-format off + #include -// clang-format on + #include diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp index 8f85374c1d0..c7a94baad54 100644 --- a/dpcpp/distributed/partition_helpers_kernels.dp.cpp +++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp @@ -3,11 +3,9 @@ // SPDX-License-Identifier: BSD-3-Clause -// clang-format off #include #include #include -// clang-format on #include "core/distributed/partition_helpers_kernels.hpp" diff --git a/dpcpp/distributed/partition_kernels.dp.cpp b/dpcpp/distributed/partition_kernels.dp.cpp index 3d2c403e35d..5eeb2f85178 100644 --- a/dpcpp/distributed/partition_kernels.dp.cpp +++ b/dpcpp/distributed/partition_kernels.dp.cpp @@ -3,10 +3,8 @@ // SPDX-License-Identifier: BSD-3-Clause -// clang-format off #include #include -// clang-format off #include "core/distributed/partition_kernels.hpp" diff --git a/dpcpp/multigrid/pgm_kernels.dp.cpp b/dpcpp/multigrid/pgm_kernels.dp.cpp index d25d44ed8e9..3241c8b1ed1 100644 --- a/dpcpp/multigrid/pgm_kernels.dp.cpp +++ b/dpcpp/multigrid/pgm_kernels.dp.cpp @@ -3,10 +3,7 @@ // SPDX-License-Identifier: BSD-3-Clause -// clang-format off -// oneDPL needs to be 
first to avoid issues with libstdc++ TBB impl #include -// clang-format on #include "core/multigrid/pgm_kernels.hpp" From 65c7d504cf521d84ae35ca3d7aa08faebc5c5fd0 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 28 Jun 2024 09:47:27 +0200 Subject: [PATCH 022/448] fixup! use clang-format to format includes --- .clang-format | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.clang-format b/.clang-format index acd1e4321c2..8bb6ededfdb 100644 --- a/.clang-format +++ b/.clang-format @@ -67,7 +67,7 @@ IncludeCategories: - Regex: '(^<(hip/hip_runtime|cuda(_runtime)?)\.h)|common/cuda_hip/base/runtime\.hpp$' Priority: 2 SortPriority: 2 - - Regex: '^<(omp|cu|hip|oneapi|thrust|CL/|cooperative|mpi|nvToolsExt).*' + - Regex: '^<(omp|cu|hip|oneapi|thrust|CL/|cooperative|mpi|nvToolsExt|Kokkos).*' Priority: 2 SortPriority: 3 - Regex: '^<(nlohmann|gflags|gtest|sde_lib|papi).*' From b9545ddf0d1805331f30e00f79a75a108a86a62e Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 28 Jun 2024 09:48:01 +0200 Subject: [PATCH 023/448] automatically update header ordering with clang-format --- accessor/accessor_helper.hpp | 1 - accessor/block_col_major.hpp | 1 - accessor/cuda_helper.hpp | 2 -- accessor/hip_helper.hpp | 2 -- accessor/math.hpp | 1 - accessor/range.hpp | 1 - accessor/reduced_row_major.hpp | 1 - accessor/reduced_row_major_reference.hpp | 1 - accessor/reference_helper.hpp | 1 - accessor/row_major.hpp | 1 - accessor/scaled_reduced_row_major.hpp | 1 - accessor/scaled_reduced_row_major_reference.hpp | 1 - benchmark/blas/blas.cpp | 4 +--- benchmark/blas/blas_common.hpp | 4 +--- benchmark/blas/distributed/multi_vector.cpp | 5 ++--- benchmark/conversion/conversion.cpp | 4 +--- benchmark/matrix_generator/matrix_generator.cpp | 4 +--- benchmark/matrix_statistics/matrix_statistics.cpp | 5 +---- benchmark/preconditioner/preconditioner.cpp | 4 +--- benchmark/solver/distributed/solver.cpp | 5 ++--- benchmark/solver/solver.cpp | 4 +--- benchmark/sparse_blas/operations.cpp 
| 5 ++--- benchmark/sparse_blas/operations.hpp | 4 +--- benchmark/sparse_blas/sparse_blas.cpp | 4 +--- benchmark/spmv/distributed/spmv.cpp | 5 ++--- benchmark/spmv/spmv.cpp | 4 +--- benchmark/tools/matrix.cpp | 2 -- benchmark/tools/mtx_to_binary.cpp | 1 - benchmark/utils/cuda_linops.cpp | 5 +---- benchmark/utils/cuda_timer.cpp | 1 - benchmark/utils/dpcpp_linops.dp.cpp | 5 +---- benchmark/utils/dpcpp_timer.dp.cpp | 2 -- benchmark/utils/formats.hpp | 5 +---- benchmark/utils/general.hpp | 6 +----- benchmark/utils/general_matrix.hpp | 4 +--- benchmark/utils/generator.hpp | 1 - benchmark/utils/hip_linops.hip.cpp | 4 +--- benchmark/utils/hip_timer.hip.cpp | 1 - benchmark/utils/iteration_control.hpp | 4 +--- benchmark/utils/loggers.hpp | 4 +--- benchmark/utils/mpi_timer.cpp | 1 - benchmark/utils/overhead_linop.hpp | 2 -- benchmark/utils/preconditioners.hpp | 5 +---- benchmark/utils/runner.hpp | 4 +--- benchmark/utils/sparselib_linops.hpp | 1 - benchmark/utils/timer.hpp | 5 +---- benchmark/utils/timer_impl.hpp | 5 ++--- benchmark/utils/tuning_variables.cpp | 5 ++--- benchmark/utils/types.hpp | 1 - cmake/openmpi_test.cpp | 1 - common/cuda_hip/base/thrust.hpp | 1 - common/unified/base/device_matrix_data_kernels.cpp | 2 -- common/unified/base/index_set_kernels.cpp | 2 -- common/unified/base/kernel_launch.hpp | 1 - common/unified/components/absolute_array_kernels.cpp | 1 - common/unified/components/fill_array_kernels.cpp | 1 - common/unified/components/format_conversion_kernels.cpp | 2 -- .../unified/components/precision_conversion_kernels.cpp | 1 - common/unified/components/reduce_array_kernels.cpp | 2 -- common/unified/distributed/partition_helpers_kernels.cpp | 1 - common/unified/distributed/partition_kernels.cpp | 1 - common/unified/matrix/coo_kernels.cpp | 2 -- common/unified/matrix/csr_kernels.cpp | 3 --- common/unified/matrix/dense_kernels.template.cpp | 2 -- common/unified/matrix/diagonal_kernels.cpp | 2 -- common/unified/matrix/ell_kernels.cpp | 2 -- 
common/unified/matrix/hybrid_kernels.cpp | 1 - common/unified/matrix/permutation_kernels.cpp | 2 -- common/unified/matrix/scaled_permutation_kernels.cpp | 2 -- common/unified/matrix/sellp_kernels.cpp | 2 -- common/unified/matrix/sparsity_csr_kernels.cpp | 2 -- common/unified/multigrid/pgm_kernels.cpp | 2 -- common/unified/preconditioner/jacobi_kernels.cpp | 2 -- common/unified/solver/bicg_kernels.cpp | 2 -- common/unified/solver/bicgstab_kernels.cpp | 2 -- common/unified/solver/cg_kernels.cpp | 2 -- common/unified/solver/cgs_kernels.cpp | 2 -- common/unified/solver/common_gmres_kernels.cpp | 2 -- common/unified/solver/fcg_kernels.cpp | 2 -- common/unified/solver/gcr_kernels.cpp | 2 -- common/unified/solver/gmres_kernels.cpp | 2 -- common/unified/solver/ir_kernels.cpp | 1 - core/base/allocator.hpp | 1 - core/base/array.cpp | 2 -- core/base/batch_multi_vector.cpp | 3 --- core/base/batch_multi_vector_kernels.hpp | 3 --- core/base/batch_utilities.hpp | 1 - core/base/block_operator.cpp | 3 --- core/base/combination.cpp | 1 - core/base/composition.cpp | 3 --- core/base/dense_cache.cpp | 1 - core/base/device_matrix_data.cpp | 2 -- core/base/device_matrix_data_kernels.hpp | 6 +----- core/base/dispatch_helper.hpp | 1 - core/base/executor.cpp | 1 - core/base/extended_float.hpp | 1 - core/base/index_range.hpp | 1 - core/base/index_set.cpp | 3 --- core/base/index_set_kernels.hpp | 4 +--- core/base/iterator_factory.hpp | 1 - core/base/memory.cpp | 2 -- core/base/mpi.cpp | 1 - core/base/mtx_io.cpp | 2 -- core/base/perturbation.cpp | 1 - core/base/segmented_array.cpp | 1 - core/base/timer.cpp | 3 --- core/base/utils.hpp | 5 +---- core/base/workspace_aliases.hpp | 1 - core/components/absolute_array_kernels.hpp | 2 -- core/components/addressable_pq.hpp | 2 -- core/components/fill_array_kernels.hpp | 2 -- core/components/format_conversion_kernels.hpp | 2 -- core/components/precision_conversion_kernels.hpp | 2 -- core/components/prefix_sum_kernels.hpp | 2 -- 
core/components/reduce_array_kernels.hpp | 2 -- core/config/config.cpp | 3 --- core/config/config_helper.cpp | 5 ++--- core/config/config_helper.hpp | 2 -- core/config/dispatch.hpp | 2 -- core/config/factorization_config.cpp | 1 - core/config/multigrid_config.cpp | 4 +--- core/config/parse_macro.hpp | 1 - core/config/preconditioner_config.cpp | 1 - core/config/property_tree.cpp | 1 - core/config/registry.cpp | 2 -- core/config/registry_accessor.hpp | 1 - core/config/solver_config.cpp | 4 ++-- core/config/solver_config.hpp | 1 - core/config/stop_config.cpp | 4 ++-- core/config/trisolver_config.hpp | 1 - core/config/type_descriptor.cpp | 2 -- core/config/type_descriptor_helper.hpp | 1 - core/device_hooks/common_kernels.inc.cpp | 1 - core/device_hooks/cuda_hooks.cpp | 1 - core/device_hooks/dpcpp_hooks.cpp | 1 - core/device_hooks/hip_hooks.cpp | 1 - core/distributed/helpers.hpp | 2 -- core/distributed/index_map.cpp | 1 - core/distributed/index_map_kernels.hpp | 5 +---- core/distributed/matrix.cpp | 2 -- core/distributed/matrix_kernels.hpp | 1 - core/distributed/partition.cpp | 1 - core/distributed/partition_helpers.cpp | 3 --- core/distributed/partition_helpers_kernels.hpp | 1 - core/distributed/partition_kernels.hpp | 1 - core/distributed/preconditioner/schwarz.cpp | 3 --- core/distributed/vector.cpp | 2 -- core/distributed/vector_kernels.hpp | 1 - core/factorization/cholesky.cpp | 2 -- core/factorization/cholesky_kernels.hpp | 2 -- core/factorization/elimination_forest.cpp | 1 - core/factorization/elimination_forest.hpp | 1 - core/factorization/factorization.cpp | 2 -- core/factorization/factorization_kernels.hpp | 2 -- core/factorization/ic.cpp | 3 --- core/factorization/ic_kernels.hpp | 6 +----- core/factorization/ilu.cpp | 3 --- core/factorization/ilu_kernels.hpp | 6 +----- core/factorization/lu.cpp | 2 -- core/factorization/lu_kernels.hpp | 2 -- core/factorization/par_ic.cpp | 3 --- core/factorization/par_ic_kernels.hpp | 6 +----- core/factorization/par_ict.cpp | 
3 --- core/factorization/par_ict_kernels.hpp | 6 +----- core/factorization/par_ilu.cpp | 3 --- core/factorization/par_ilu_kernels.hpp | 6 +----- core/factorization/par_ilut.cpp | 3 --- core/factorization/par_ilut_kernels.hpp | 6 +----- core/factorization/symbolic.cpp | 2 -- core/factorization/symbolic.hpp | 1 - core/log/batch_logger.cpp | 2 -- core/log/convergence.cpp | 2 -- core/log/papi.cpp | 2 -- core/log/performance_hint.cpp | 1 - core/log/profiler_hook.cpp | 3 --- core/log/profiler_hook_summary.cpp | 1 - core/log/profiler_hook_summary_writer.cpp | 1 - core/log/record.cpp | 1 - core/log/stream.cpp | 2 -- core/matrix/batch_csr.cpp | 3 --- core/matrix/batch_csr_kernels.hpp | 5 +---- core/matrix/batch_dense.cpp | 3 --- core/matrix/batch_dense_kernels.hpp | 5 +---- core/matrix/batch_ell.cpp | 3 --- core/matrix/batch_ell_kernels.hpp | 5 +---- core/matrix/batch_identity.cpp | 2 -- core/matrix/coo.cpp | 3 --- core/matrix/coo_kernels.hpp | 5 +---- core/matrix/csr.cpp | 2 -- core/matrix/csr_accessor_helper.hpp | 1 - core/matrix/csr_kernels.hpp | 5 +---- core/matrix/csr_lookup.hpp | 1 - core/matrix/dense.cpp | 3 --- core/matrix/dense_kernels.hpp | 6 +----- core/matrix/diagonal.cpp | 2 -- core/matrix/diagonal_kernels.hpp | 5 +---- core/matrix/ell.cpp | 3 --- core/matrix/ell_kernels.hpp | 5 +---- core/matrix/fbcsr.cpp | 3 --- core/matrix/fbcsr_kernels.hpp | 5 +---- core/matrix/fft.cpp | 2 -- core/matrix/hybrid.cpp | 3 --- core/matrix/hybrid_kernels.hpp | 5 +---- core/matrix/identity.cpp | 1 - core/matrix/permutation.cpp | 2 -- core/matrix/permutation.hpp | 4 +--- core/matrix/permutation_kernels.hpp | 1 - core/matrix/row_gatherer.cpp | 2 -- core/matrix/scaled_permutation.cpp | 2 -- core/matrix/scaled_permutation_kernels.hpp | 1 - core/matrix/sellp.cpp | 2 -- core/matrix/sellp_kernels.hpp | 5 +---- core/matrix/sparsity_csr.cpp | 2 -- core/matrix/sparsity_csr_kernels.hpp | 5 +---- core/mpi/exception.cpp | 2 -- core/multigrid/fixed_coarsening.cpp | 2 -- core/multigrid/pgm.cpp 
| 2 -- core/multigrid/pgm_kernels.hpp | 2 -- core/preconditioner/batch_jacobi.cpp | 1 - core/preconditioner/batch_jacobi_kernels.hpp | 5 +---- core/preconditioner/ic.cpp | 2 -- core/preconditioner/ilu.cpp | 2 -- core/preconditioner/isai.cpp | 3 --- core/preconditioner/isai_kernels.hpp | 5 +---- core/preconditioner/jacobi.cpp | 3 --- core/preconditioner/jacobi_kernels.hpp | 5 +---- core/preconditioner/jacobi_utils.hpp | 1 - core/reorder/amd.cpp | 3 --- core/reorder/mc64.cpp | 3 --- core/reorder/mc64.hpp | 5 +---- core/reorder/nested_dissection.cpp | 2 -- core/reorder/rcm.cpp | 3 --- core/reorder/rcm_kernels.hpp | 6 +----- core/reorder/scaled_reordered.cpp | 2 -- core/solver/batch_bicgstab.cpp | 2 -- core/solver/batch_bicgstab_kernels.hpp | 1 - core/solver/batch_cg.cpp | 2 -- core/solver/batch_cg_kernels.hpp | 1 - core/solver/batch_dispatch.hpp | 1 - core/solver/bicg.cpp | 2 -- core/solver/bicg_kernels.hpp | 2 -- core/solver/bicgstab.cpp | 2 -- core/solver/bicgstab_kernels.hpp | 2 -- core/solver/cb_gmres.cpp | 3 --- core/solver/cb_gmres_accessor.hpp | 2 -- core/solver/cb_gmres_kernels.hpp | 1 - core/solver/cg.cpp | 2 -- core/solver/cg_kernels.hpp | 2 -- core/solver/cgs.cpp | 2 -- core/solver/cgs_kernels.hpp | 2 -- core/solver/common_gmres_kernels.hpp | 1 - core/solver/direct.cpp | 3 --- core/solver/fcg.cpp | 2 -- core/solver/fcg_kernels.hpp | 2 -- core/solver/gcr.cpp | 2 -- core/solver/gcr_kernels.hpp | 1 - core/solver/gmres.cpp | 2 -- core/solver/gmres_kernels.hpp | 1 - core/solver/idr.cpp | 2 -- core/solver/idr_kernels.hpp | 1 - core/solver/ir.cpp | 2 -- core/solver/ir_kernels.hpp | 2 -- core/solver/lower_trs.cpp | 1 - core/solver/lower_trs_kernels.hpp | 2 -- core/solver/multigrid.cpp | 3 --- core/solver/multigrid_kernels.hpp | 1 - core/solver/upper_trs.cpp | 1 - core/solver/upper_trs_kernels.hpp | 2 -- core/stop/criterion.cpp | 1 - core/stop/criterion_kernels.hpp | 1 - core/stop/residual_norm.cpp | 2 -- core/synthesizer/implementation_selection.hpp | 1 - 
core/test/accessor/block_col_major.cpp | 5 ++--- core/test/accessor/index_span.cpp | 5 ++--- core/test/accessor/math.cpp | 6 ++---- core/test/accessor/range.cpp | 6 ++---- core/test/accessor/reduced_row_major.cpp | 5 ++--- core/test/accessor/reduced_row_major_ginkgo.cpp | 2 -- core/test/accessor/reduced_row_major_reference.cpp | 5 ++--- core/test/accessor/row_major.cpp | 5 ++--- core/test/accessor/scaled_reduced_row_major.cpp | 5 ++--- .../test/accessor/scaled_reduced_row_major_reference.cpp | 5 ++--- core/test/base/abstract_factory.cpp | 5 ++--- core/test/base/allocator.cpp | 2 -- core/test/base/array.cpp | 7 +------ core/test/base/batch_dim.cpp | 6 ++---- core/test/base/batch_lin_op.cpp | 6 +----- core/test/base/batch_multi_vector.cpp | 6 +----- core/test/base/block_operator.cpp | 7 +------ core/test/base/combination.cpp | 5 +---- core/test/base/composition.cpp | 5 +---- core/test/base/deferred_factory.cpp | 1 - core/test/base/dense_cache.cpp | 6 +----- core/test/base/dim.cpp | 6 ++---- core/test/base/exception.cpp | 5 ++--- core/test/base/exception_helpers.cpp | 5 ++--- core/test/base/executor.cpp | 6 ++---- core/test/base/extended_float.cpp | 2 -- core/test/base/index_range.cpp | 5 ++--- core/test/base/iterator_factory.cpp | 3 --- core/test/base/lin_op.cpp | 6 +----- core/test/base/math.cpp | 6 ++---- core/test/base/matrix_assembly_data.cpp | 5 ++--- core/test/base/matrix_data.cpp | 6 ++---- core/test/base/mtx_io.cpp | 7 +------ core/test/base/perturbation.cpp | 6 ++---- core/test/base/polymorphic_object.cpp | 5 ++--- core/test/base/range.cpp | 6 ++---- core/test/base/range_accessors.cpp | 6 +----- core/test/base/sanitizers.cpp | 1 - core/test/base/segmented_array.cpp | 5 ++--- core/test/base/types.cpp | 7 ++----- core/test/base/utils.cpp | 5 +---- core/test/base/version.cpp | 6 ++---- core/test/components/addressable_pq.cpp | 4 ---- core/test/components/disjoint_sets.cpp | 4 ---- core/test/config/config.cpp | 6 +----- core/test/config/factorization.cpp | 3 --- 
core/test/config/multigrid.cpp | 3 --- core/test/config/preconditioner.cpp | 3 --- core/test/config/property_tree.cpp | 6 +----- core/test/config/registry.cpp | 6 +----- core/test/config/solver.cpp | 3 --- core/test/config/type_descriptor.cpp | 4 +--- core/test/distributed/index_map.cpp | 6 +----- core/test/factorization/elimination_forest.cpp | 4 ---- core/test/factorization/par_ic.cpp | 6 +----- core/test/factorization/par_ict.cpp | 6 +----- core/test/factorization/par_ilu.cpp | 6 +----- core/test/factorization/par_ilut.cpp | 6 +----- core/test/gtest/environments.hpp | 3 --- core/test/gtest/ginkgo_main.cpp | 1 - core/test/gtest/ginkgo_mpi_main.cpp | 4 ---- core/test/gtest/resources.cpp | 5 ++--- core/test/log/convergence.cpp | 6 +----- core/test/log/logger.cpp | 6 +----- core/test/log/papi.cpp | 7 +------ core/test/log/performance_hint.cpp | 7 +------ core/test/log/profiler_hook.cpp | 8 ++------ core/test/log/record.cpp | 6 +----- core/test/log/stream.cpp | 7 +------ core/test/matrix/batch_csr.cpp | 6 +----- core/test/matrix/batch_dense.cpp | 6 +----- core/test/matrix/batch_ell.cpp | 6 +----- core/test/matrix/batch_identity.cpp | 6 +----- core/test/matrix/coo.cpp | 4 +--- core/test/matrix/coo_builder.cpp | 3 --- core/test/matrix/csr.cpp | 6 +----- core/test/matrix/csr_builder.cpp | 3 --- core/test/matrix/dense.cpp | 6 +----- core/test/matrix/diagonal.cpp | 4 +--- core/test/matrix/ell.cpp | 4 +--- core/test/matrix/fbcsr.cpp | 7 +------ core/test/matrix/fbcsr_builder.cpp | 3 --- core/test/matrix/fbcsr_sample.hpp | 1 - core/test/matrix/hybrid.cpp | 4 +--- core/test/matrix/identity.cpp | 6 +----- core/test/matrix/permutation.cpp | 6 +----- core/test/matrix/row_gatherer.cpp | 6 +----- core/test/matrix/sellp.cpp | 4 +--- core/test/matrix/sparsity_csr.cpp | 7 +------ core/test/mpi/base/bindings.cpp | 3 --- core/test/mpi/base/communicator.cpp | 2 -- core/test/mpi/base/exception_helpers.cpp | 2 -- core/test/mpi/base/polymorphic_object.cpp | 1 - 
core/test/mpi/base/rank_mapping.cpp | 3 --- core/test/mpi/distributed/helpers.cpp | 5 ++--- core/test/mpi/distributed/matrix.cpp | 2 -- core/test/mpi/distributed/preconditioner/schwarz.cpp | 2 -- core/test/mpi/distributed/solver/multigrid.cpp | 2 -- core/test/multigrid/fixed_coarsening.cpp | 7 +------ core/test/multigrid/pgm.cpp | 7 +------ core/test/preconditioner/batch_jacobi.cpp | 6 +----- core/test/preconditioner/ic.cpp | 3 --- core/test/preconditioner/ilu.cpp | 3 --- core/test/preconditioner/isai.cpp | 7 +------ core/test/preconditioner/jacobi.cpp | 6 +----- core/test/reorder/amd.cpp | 7 +------ core/test/reorder/nested_dissection.cpp | 7 +------ core/test/reorder/rcm.cpp | 7 +------ core/test/reorder/scaled_reordered.cpp | 6 +----- core/test/solver/batch_bicgstab.cpp | 6 +----- core/test/solver/batch_cg.cpp | 6 +----- core/test/solver/bicg.cpp | 7 +------ core/test/solver/bicgstab.cpp | 6 +----- core/test/solver/cb_gmres.cpp | 7 +------ core/test/solver/cg.cpp | 7 +------ core/test/solver/cgs.cpp | 7 +------ core/test/solver/direct.cpp | 7 +------ core/test/solver/fcg.cpp | 6 +----- core/test/solver/gcr.cpp | 7 +------ core/test/solver/gmres.cpp | 7 +------ core/test/solver/idr.cpp | 6 +----- core/test/solver/ir.cpp | 7 +------ core/test/solver/lower_trs.cpp | 3 --- core/test/solver/multigrid.cpp | 7 +------ core/test/solver/upper_trs.cpp | 3 --- core/test/solver/workspace.cpp | 7 +------ core/test/stop/combined.cpp | 6 +----- core/test/stop/criterion.cpp | 5 ++--- core/test/stop/iteration.cpp | 5 ++--- core/test/stop/stopping_status.cpp | 5 ++--- core/test/stop/time.cpp | 6 ++---- core/test/utils.hpp | 3 --- core/test/utils/array_generator.hpp | 1 - core/test/utils/array_generator_test.cpp | 3 --- core/test/utils/assertions.hpp | 3 --- core/test/utils/assertions_test.cpp | 3 --- core/test/utils/batch_helpers.hpp | 2 -- core/test/utils/fb_matrix_generator.hpp | 2 -- core/test/utils/fb_matrix_generator_test.cpp | 3 --- core/test/utils/matrix_generator.hpp | 2 
-- core/test/utils/matrix_generator_test.cpp | 3 --- core/test/utils/matrix_utils_test.cpp | 4 ---- core/test/utils/unsort_matrix.hpp | 2 -- core/test/utils/unsort_matrix_test.cpp | 4 ---- core/test/utils/value_generator.hpp | 1 - core/test/utils/value_generator_test.cpp | 3 --- cuda/base/batch_multi_vector_kernels.cu | 3 --- cuda/base/batch_struct.hpp | 1 - cuda/base/config.hpp | 1 - cuda/base/cublas_bindings.hpp | 2 -- cuda/base/curand_bindings.hpp | 2 -- cuda/base/cusparse_bindings.hpp | 2 -- cuda/base/cusparse_block_bindings.hpp | 2 -- cuda/base/cusparse_handle.hpp | 1 - cuda/base/device.cpp | 5 ++--- cuda/base/device_matrix_data_kernels.cu | 2 -- cuda/base/exception.cpp | 5 +---- cuda/base/executor.cpp | 4 ---- cuda/base/index_set_kernels.cpp | 2 -- cuda/base/kernel_config.hpp | 1 - cuda/base/kernel_launch.cuh | 1 - cuda/base/math.hpp | 5 ++--- cuda/base/memory.cpp | 3 --- cuda/base/nvtx.cpp | 1 - cuda/base/pointer_mode_guard.hpp | 4 +--- cuda/base/scoped_device_id.cpp | 7 ++----- cuda/base/stream.cpp | 3 --- cuda/base/thrust.cuh | 1 - cuda/base/timer.cpp | 3 --- cuda/base/types.hpp | 8 ++------ cuda/components/atomic.cuh | 1 - cuda/components/cooperative_groups.cuh | 4 +--- cuda/components/diagonal_block_manipulation.cuh | 1 - cuda/components/format_conversion.cuh | 1 - cuda/components/memory.cuh | 2 -- cuda/components/prefix_sum.cuh | 1 - cuda/components/prefix_sum_kernels.cu | 4 ---- cuda/components/reduction.cuh | 2 -- cuda/components/syncfree.cuh | 1 - cuda/components/warp_blas.cuh | 2 -- cuda/distributed/index_map_kernels.cu | 3 --- cuda/distributed/matrix_kernels.cu | 3 --- cuda/distributed/partition_helpers_kernels.cu | 2 -- cuda/distributed/partition_kernels.cu | 2 -- cuda/distributed/vector_kernels.cu | 3 --- cuda/factorization/cholesky_kernels.cu | 4 ---- cuda/factorization/factorization_kernels.cu | 2 -- cuda/factorization/ic_kernels.cu | 2 -- cuda/factorization/ilu_kernels.cu | 2 -- cuda/factorization/lu_kernels.cu | 4 ---- 
cuda/factorization/par_ic_kernels.cu | 2 -- cuda/factorization/par_ict_kernels.cu | 2 -- cuda/factorization/par_ilu_kernels.cu | 2 -- cuda/factorization/par_ilut_approx_filter_kernels.cu | 6 +----- cuda/factorization/par_ilut_filter_kernels.cu | 5 +---- cuda/factorization/par_ilut_select_common.cu | 1 - cuda/factorization/par_ilut_select_kernels.cu | 6 +----- cuda/factorization/par_ilut_spgeam_kernels.cu | 5 +---- cuda/factorization/par_ilut_sweep_kernels.cu | 5 +---- cuda/matrix/batch_csr_kernels.cu | 3 --- cuda/matrix/batch_dense_kernels.cu | 3 --- cuda/matrix/batch_ell_kernels.cu | 3 --- cuda/matrix/batch_struct.hpp | 5 +---- cuda/matrix/coo_kernels.cu | 2 -- cuda/matrix/csr_kernels.template.cu | 4 ---- cuda/matrix/dense_kernels.cu | 2 -- cuda/matrix/diagonal_kernels.cu | 2 -- cuda/matrix/ell_kernels.cu | 3 --- cuda/matrix/fbcsr_kernels.template.cu | 4 ---- cuda/matrix/fft_kernels.cu | 3 --- cuda/matrix/sellp_kernels.cu | 2 -- cuda/matrix/sparsity_csr_kernels.cu | 3 --- cuda/multigrid/pgm_kernels.cu | 4 ---- cuda/preconditioner/batch_jacobi_kernels.cu | 2 -- cuda/preconditioner/isai_kernels.cu | 2 -- cuda/preconditioner/jacobi_advanced_apply_kernels.cu | 5 +---- .../jacobi_advanced_apply_kernels.instantiate.cu | 5 +---- cuda/preconditioner/jacobi_generate_kernels.cu | 5 +---- .../jacobi_generate_kernels.instantiate.cu | 5 +---- cuda/preconditioner/jacobi_kernels.cu | 2 -- cuda/preconditioner/jacobi_simple_apply_kernels.cu | 5 +---- .../jacobi_simple_apply_kernels.instantiate.cu | 5 +---- cuda/reorder/rcm_kernels.cu | 3 --- cuda/solver/batch_bicgstab_kernels.cu | 3 --- cuda/solver/batch_cg_kernels.cu | 3 --- cuda/solver/cb_gmres_kernels.cu | 3 --- cuda/solver/common_trs_kernels.cuh | 3 --- cuda/solver/idr_kernels.cu | 3 --- cuda/solver/lower_trs_kernels.cu | 4 ---- cuda/solver/multigrid_kernels.cu | 2 -- cuda/solver/upper_trs_kernels.cu | 4 ---- cuda/stop/criterion_kernels.cu | 2 -- cuda/stop/residual_norm_kernels.cu | 2 -- cuda/test/base/array.cpp | 6 +----- 
cuda/test/base/cuda_executor.cu | 6 +----- cuda/test/base/cuda_executor_topology.cu | 7 ++----- cuda/test/base/exception_helpers.cu | 8 +++----- cuda/test/base/index_set.cpp | 7 +------ cuda/test/base/kernel_launch.cu | 4 ---- cuda/test/base/lin_op.cpp | 1 - cuda/test/base/math.cu | 8 ++------ cuda/test/base/memory.cpp | 7 +------ cuda/test/base/scoped_device_id.cu | 6 ++---- cuda/test/components/cooperative_groups.cu | 6 ++---- cuda/test/components/merging.cu | 4 ---- cuda/test/components/searching.cu | 4 ---- cuda/test/components/sorting.cu | 4 ---- cuda/test/solver/lower_trs_kernels.cu | 7 ++----- cuda/test/solver/upper_trs_kernels.cu | 7 ++----- cuda/test/utils.hpp | 5 +---- cuda/test/utils/assertions_test.cu | 3 --- devices/device.cpp | 1 - devices/dpcpp/executor.cpp | 2 -- devices/machine_topology.cpp | 1 - devices/omp/executor.cpp | 2 -- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 4 ---- dpcpp/base/batch_struct.hpp | 1 - dpcpp/base/config.hpp | 1 - dpcpp/base/device_matrix_data_kernels.dp.cpp | 4 ---- dpcpp/base/executor.dp.cpp | 3 --- dpcpp/base/helper.dp.cpp | 5 ++--- dpcpp/base/helper.hpp | 3 --- dpcpp/base/index_set_kernels.dp.cpp | 2 -- dpcpp/base/kernel_launch.dp.hpp | 1 - dpcpp/base/kernel_launch_reduction.dp.hpp | 1 - dpcpp/base/onedpl.hpp | 1 - dpcpp/base/onemkl_bindings.hpp | 2 -- dpcpp/base/scoped_device_id.dp.cpp | 1 - dpcpp/base/timer.dp.cpp | 2 -- dpcpp/components/atomic.dp.hpp | 2 -- dpcpp/components/cooperative_groups.dp.hpp | 2 -- dpcpp/components/diagonal_block_manipulation.dp.hpp | 2 -- dpcpp/components/format_conversion.dp.hpp | 3 --- dpcpp/components/intrinsics.dp.hpp | 2 -- dpcpp/components/merging.dp.hpp | 2 -- dpcpp/components/prefix_sum.dp.hpp | 2 -- dpcpp/components/prefix_sum_kernels.dp.cpp | 3 --- dpcpp/components/reduction.dp.hpp | 3 --- dpcpp/components/searching.dp.hpp | 1 - dpcpp/components/segment_scan.dp.hpp | 1 - dpcpp/components/sorting.dp.hpp | 1 - dpcpp/components/thread_ids.dp.hpp | 1 - 
dpcpp/components/uninitialized_array.hpp | 1 - dpcpp/components/warp_blas.dp.hpp | 3 --- dpcpp/distributed/index_map_kernels.dp.cpp | 1 - dpcpp/distributed/matrix_kernels.dp.cpp | 1 - dpcpp/distributed/partition_helpers_kernels.dp.cpp | 2 -- dpcpp/distributed/partition_kernels.dp.cpp | 3 --- dpcpp/distributed/vector_kernels.dp.cpp | 1 - dpcpp/factorization/cholesky_kernels.dp.cpp | 4 ---- dpcpp/factorization/factorization_kernels.dp.cpp | 3 --- dpcpp/factorization/lu_kernels.dp.cpp | 3 --- dpcpp/factorization/par_ic_kernels.dp.cpp | 3 --- dpcpp/factorization/par_ict_kernels.dp.cpp | 4 ---- dpcpp/factorization/par_ilu_kernels.dp.cpp | 3 --- dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp | 7 +------ dpcpp/factorization/par_ilut_filter_kernel.dp.cpp | 6 +----- dpcpp/factorization/par_ilut_kernels.dp.cpp | 4 ---- dpcpp/factorization/par_ilut_select_common.dp.cpp | 7 ++----- dpcpp/factorization/par_ilut_select_kernel.dp.cpp | 7 +------ dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp | 7 +------ dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp | 6 +----- dpcpp/log/batch_logger.hpp | 1 - dpcpp/matrix/batch_csr_kernels.dp.cpp | 4 ---- dpcpp/matrix/batch_dense_kernels.dp.cpp | 4 ---- dpcpp/matrix/batch_ell_kernels.dp.cpp | 4 ---- dpcpp/matrix/batch_struct.hpp | 5 +---- dpcpp/matrix/coo_kernels.dp.cpp | 3 --- dpcpp/matrix/csr_kernels.dp.cpp | 4 ---- dpcpp/matrix/dense_kernels.dp.cpp | 3 --- dpcpp/matrix/diagonal_kernels.dp.cpp | 3 --- dpcpp/matrix/ell_kernels.dp.cpp | 4 ---- dpcpp/matrix/fbcsr_kernels.dp.cpp | 3 --- dpcpp/matrix/fft_kernels.dp.cpp | 1 - dpcpp/matrix/sellp_kernels.dp.cpp | 3 --- dpcpp/matrix/sparsity_csr_kernels.dp.cpp | 3 --- dpcpp/multigrid/pgm_kernels.dp.cpp | 5 ----- dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp | 2 -- dpcpp/preconditioner/isai_kernels.dp.cpp | 3 --- .../jacobi_advanced_apply_instantiate.inc.dp.cpp | 6 +----- dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp | 5 +---- .../jacobi_generate_instantiate.inc.dp.cpp | 6 
+----- dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp | 5 +---- dpcpp/preconditioner/jacobi_kernels.dp.cpp | 3 --- .../jacobi_simple_apply_instantiate.inc.dp.cpp | 6 +----- dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp | 5 +---- dpcpp/reorder/rcm_kernels.dp.cpp | 2 -- dpcpp/solver/batch_bicgstab_kernels.dp.cpp | 3 --- dpcpp/solver/batch_cg_kernels.dp.cpp | 3 --- dpcpp/solver/cb_gmres_kernels.dp.cpp | 4 ---- dpcpp/solver/idr_kernels.dp.cpp | 7 ++----- dpcpp/solver/lower_trs_kernels.dp.cpp | 3 --- dpcpp/solver/multigrid_kernels.dp.cpp | 2 -- dpcpp/solver/upper_trs_kernels.dp.cpp | 3 --- dpcpp/stop/batch_criteria.hpp | 1 - dpcpp/stop/criterion_kernels.dp.cpp | 2 -- dpcpp/stop/residual_norm_kernels.dp.cpp | 3 --- dpcpp/synthesizer/implementation_selection.hpp | 2 -- dpcpp/test/base/dim3.dp.cpp | 2 -- dpcpp/test/base/executor.dp.cpp | 7 +------ dpcpp/test/base/kernel_launch.dp.cpp | 4 ---- dpcpp/test/components/cooperative_groups.dp.cpp | 5 ----- dpcpp/test/matrix/fbcsr_kernels.dp.cpp | 2 -- dpcpp/test/preconditioner/jacobi_kernels.dp.cpp | 7 +------ .../adaptiveprecision-blockjacobi.cpp | 5 ++--- examples/cb-gmres/cb-gmres.cpp | 4 ++-- examples/custom-matrix-format/custom-matrix-format.cpp | 2 +- .../custom-stopping-criterion.cpp | 5 ++--- .../external-lib-interfacing.cpp | 1 - examples/ginkgo-overhead/ginkgo-overhead.cpp | 5 ++--- examples/ginkgo-ranges/ginkgo-ranges.cpp | 3 ++- examples/heat-equation/heat-equation.cpp | 6 ++---- .../ilu-preconditioned-solver.cpp | 5 ++--- examples/inverse-iteration/inverse-iteration.cpp | 5 ++--- .../ir-ilu-preconditioned-solver.cpp | 5 ++--- examples/iterative-refinement/iterative-refinement.cpp | 5 ++--- examples/kokkos-assembly/kokkos-assembly.cpp | 3 +-- examples/minimal-cuda-solver/minimal-cuda-solver.cpp | 3 ++- .../mixed-multigrid-preconditioned-solver.cpp | 5 ++--- .../mixed-multigrid-solver/mixed-multigrid-solver.cpp | 5 ++--- examples/mixed-precision-ir/mixed-precision-ir.cpp | 5 ++--- 
.../multigrid-preconditioned-solver-customized.cpp | 5 ++--- .../multigrid-preconditioned-solver.cpp | 5 ++--- .../nine-pt-stencil-solver/nine-pt-stencil-solver.cpp | 3 ++- examples/papi-logging/papi-logging.cpp | 8 ++++---- examples/par-ilu-convergence/par-ilu-convergence.cpp | 1 - examples/performance-debugging/performance-debugging.cpp | 5 ++--- examples/poisson-solver/poisson-solver.cpp | 3 ++- examples/preconditioned-solver/preconditioned-solver.cpp | 5 ++--- examples/preconditioner-export/preconditioner-export.cpp | 5 ++--- .../reordered-preconditioned-solver.cpp | 1 - .../schroedinger-splitting/schroedinger-splitting.cpp | 4 ++-- examples/simple-solver-logging/simple-solver-logging.cpp | 5 ++--- .../three-pt-stencil-solver/three-pt-stencil-solver.cpp | 3 ++- extensions/test/config/json_config.cpp | 3 --- extensions/test/kokkos/kokkos_main.cpp | 2 -- extensions/test/kokkos/spaces.cpp | 3 --- extensions/test/kokkos/types.cpp | 4 ---- hip/base/batch_multi_vector_kernels.hip.cpp | 3 --- hip/base/batch_struct.hip.hpp | 1 - hip/base/config.hip.hpp | 1 - hip/base/device.hip.cpp | 2 -- hip/base/device_matrix_data_kernels.hip.cpp | 2 -- hip/base/exception.hip.cpp | 2 -- hip/base/executor.hip.cpp | 3 --- hip/base/hipblas_bindings.hip.hpp | 1 - hip/base/hiprand_bindings.hip.hpp | 1 - hip/base/hipsparse_bindings.hip.hpp | 1 - hip/base/hipsparse_block_bindings.hip.hpp | 1 - hip/base/index_set_kernels.hip.cpp | 2 -- hip/base/kernel_launch.hip.hpp | 1 - hip/base/math.hip.hpp | 5 ++--- hip/base/memory.hip.cpp | 2 -- hip/base/pointer_mode_guard.hip.hpp | 1 - hip/base/roctx.hip.cpp | 1 - hip/base/scoped_device_id.hip.cpp | 5 ++--- hip/base/stream.hip.cpp | 2 -- hip/base/thrust.hip.hpp | 1 - hip/base/timer.hip.cpp | 2 -- hip/base/types.hip.hpp | 8 ++------ hip/components/atomic.hip.hpp | 1 - hip/components/cooperative_groups.hip.hpp | 1 - hip/components/diagonal_block_manipulation.hip.hpp | 1 - hip/components/format_conversion.hip.hpp | 1 - hip/components/memory.hip.hpp | 2 -- 
hip/components/prefix_sum.hip.hpp | 1 - hip/components/prefix_sum_kernels.hip.cpp | 4 ---- hip/components/reduction.hip.hpp | 2 -- hip/components/syncfree.hip.hpp | 1 - hip/components/warp_blas.hip.hpp | 2 -- hip/distributed/index_map_kernels.hip.cpp | 3 --- hip/distributed/matrix_kernels.hip.cpp | 3 --- hip/distributed/partition_helpers_kernels.hip.cpp | 2 -- hip/distributed/partition_kernels.hip.cpp | 2 -- hip/distributed/vector_kernels.hip.cpp | 3 --- hip/factorization/cholesky_kernels.hip.cpp | 4 ---- hip/factorization/factorization_kernels.hip.cpp | 2 -- hip/factorization/ic_kernels.hip.cpp | 2 -- hip/factorization/ilu_kernels.hip.cpp | 2 -- hip/factorization/lu_kernels.hip.cpp | 4 ---- hip/factorization/par_ic_kernels.hip.cpp | 2 -- hip/factorization/par_ict_kernels.hip.cpp | 2 -- hip/factorization/par_ilu_kernels.hip.cpp | 2 -- hip/factorization/par_ilut_approx_filter_kernels.hip.cpp | 6 +----- hip/factorization/par_ilut_filter_kernels.hip.cpp | 5 +---- hip/factorization/par_ilut_select_common.hip.cpp | 2 -- hip/factorization/par_ilut_select_kernels.hip.cpp | 6 +----- hip/factorization/par_ilut_spgeam_kernels.hip.cpp | 5 +---- hip/factorization/par_ilut_sweep_kernels.hip.cpp | 5 +---- hip/matrix/batch_csr_kernels.hip.cpp | 3 --- hip/matrix/batch_dense_kernels.hip.cpp | 3 --- hip/matrix/batch_ell_kernels.hip.cpp | 3 --- hip/matrix/batch_struct.hip.hpp | 5 +---- hip/matrix/coo_kernels.hip.cpp | 2 -- hip/matrix/csr_kernels.template.hip.cpp | 4 ---- hip/matrix/dense_kernels.hip.cpp | 2 -- hip/matrix/diagonal_kernels.hip.cpp | 2 -- hip/matrix/ell_kernels.hip.cpp | 3 --- hip/matrix/fbcsr_kernels.template.hip.cpp | 4 ---- hip/matrix/fft_kernels.hip.cpp | 2 -- hip/matrix/fft_kernels_stub.hip.cpp | 1 - hip/matrix/sellp_kernels.hip.cpp | 2 -- hip/matrix/sparsity_csr_kernels.hip.cpp | 3 --- hip/multigrid/pgm_kernels.hip.cpp | 4 ---- hip/preconditioner/batch_jacobi_kernels.hip.cpp | 2 -- hip/preconditioner/isai_kernels.hip.cpp | 2 -- 
hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp | 5 +---- .../jacobi_advanced_apply_kernels.instantiate.hip.cpp | 5 +---- hip/preconditioner/jacobi_generate_kernels.hip.cpp | 5 +---- .../jacobi_generate_kernels.instantiate.hip.cpp | 5 +---- hip/preconditioner/jacobi_kernels.hip.cpp | 2 -- hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp | 5 +---- .../jacobi_simple_apply_kernels.instantiate.hip.cpp | 5 +---- hip/reorder/rcm_kernels.hip.cpp | 3 --- hip/solver/batch_bicgstab_kernels.hip.cpp | 3 --- hip/solver/batch_cg_kernels.hip.cpp | 3 --- hip/solver/cb_gmres_kernels.hip.cpp | 3 --- hip/solver/common_trs_kernels.hip.hpp | 1 - hip/solver/idr_kernels.hip.cpp | 3 --- hip/solver/lower_trs_kernels.hip.cpp | 2 -- hip/solver/multigrid_kernels.hip.cpp | 2 -- hip/solver/upper_trs_kernels.hip.cpp | 2 -- hip/stop/criterion_kernels.hip.cpp | 2 -- hip/stop/residual_norm_kernels.hip.cpp | 2 -- hip/test/base/exception_helpers.hip.cpp | 5 ++--- hip/test/base/hip_executor.hip.cpp | 7 +------ hip/test/base/hip_executor_topology.hip.cpp | 8 ++------ hip/test/base/index_set.cpp | 7 +------ hip/test/base/kernel_launch.hip.cpp | 4 ---- hip/test/base/lin_op.cpp | 5 ++--- hip/test/base/math.hip.cpp | 9 ++------- hip/test/base/memory.cpp | 7 +------ hip/test/base/scoped_device_id.hip.cpp | 6 ++---- hip/test/components/cooperative_groups.hip.cpp | 7 ++----- hip/test/components/merging.hip.cpp | 5 ----- hip/test/components/searching.hip.cpp | 5 ----- hip/test/components/sorting.hip.cpp | 4 ---- hip/test/matrix/fbcsr_kernels.cpp | 8 ++------ hip/test/matrix/fft_kernels.hip.cpp | 6 ++---- hip/test/solver/lower_trs_kernels.cpp | 6 ++---- hip/test/solver/upper_trs_kernels.cpp | 6 ++---- hip/test/utils.hip.hpp | 5 +---- hip/test/utils/assertions_test.cpp | 3 --- include/ginkgo/core/base/abstract_factory.hpp | 1 - include/ginkgo/core/base/array.hpp | 1 - include/ginkgo/core/base/batch_dim.hpp | 1 - include/ginkgo/core/base/batch_lin_op.hpp | 1 - 
include/ginkgo/core/base/batch_multi_vector.hpp | 1 - include/ginkgo/core/base/combination.hpp | 1 - include/ginkgo/core/base/composition.hpp | 1 - include/ginkgo/core/base/dense_cache.hpp | 1 - include/ginkgo/core/base/device.hpp | 1 - include/ginkgo/core/base/dim.hpp | 1 - include/ginkgo/core/base/exception.hpp | 1 - include/ginkgo/core/base/exception_helpers.hpp | 1 - include/ginkgo/core/base/executor.hpp | 1 - include/ginkgo/core/base/index_set.hpp | 1 - include/ginkgo/core/base/intrinsics.hpp | 1 - include/ginkgo/core/base/lin_op.hpp | 1 - include/ginkgo/core/base/machine_topology.hpp | 1 - include/ginkgo/core/base/math.hpp | 1 - include/ginkgo/core/base/matrix_assembly_data.hpp | 1 - include/ginkgo/core/base/matrix_data.hpp | 1 - include/ginkgo/core/base/mpi.hpp | 1 - include/ginkgo/core/base/mtx_io.hpp | 1 - include/ginkgo/core/base/perturbation.hpp | 1 - include/ginkgo/core/base/polymorphic_object.hpp | 1 - include/ginkgo/core/base/range.hpp | 1 - include/ginkgo/core/base/range_accessors.hpp | 1 - include/ginkgo/core/base/segmented_array.hpp | 1 - include/ginkgo/core/base/temporary_clone.hpp | 1 - include/ginkgo/core/base/temporary_conversion.hpp | 1 - include/ginkgo/core/base/timer.hpp | 1 - include/ginkgo/core/base/utils_helper.hpp | 1 - include/ginkgo/core/base/version.hpp | 1 - include/ginkgo/core/config/config.hpp | 1 - include/ginkgo/core/config/registry.hpp | 1 - include/ginkgo/core/distributed/lin_op.hpp | 1 - include/ginkgo/core/distributed/polymorphic_object.hpp | 1 - include/ginkgo/core/factorization/cholesky.hpp | 1 - include/ginkgo/core/factorization/ic.hpp | 1 - include/ginkgo/core/factorization/ilu.hpp | 1 - include/ginkgo/core/factorization/lu.hpp | 1 - include/ginkgo/core/factorization/par_ic.hpp | 1 - include/ginkgo/core/factorization/par_ict.hpp | 1 - include/ginkgo/core/factorization/par_ilu.hpp | 1 - include/ginkgo/core/factorization/par_ilut.hpp | 1 - include/ginkgo/core/log/batch_logger.hpp | 1 - 
include/ginkgo/core/log/convergence.hpp | 1 - include/ginkgo/core/log/logger.hpp | 1 - include/ginkgo/core/log/papi.hpp | 2 +- include/ginkgo/core/log/performance_hint.hpp | 1 - include/ginkgo/core/log/profiler_hook.hpp | 1 - include/ginkgo/core/log/record.hpp | 1 - include/ginkgo/core/log/stream.hpp | 1 - include/ginkgo/core/matrix/batch_csr.hpp | 1 - include/ginkgo/core/matrix/batch_dense.hpp | 1 - include/ginkgo/core/matrix/batch_ell.hpp | 1 - include/ginkgo/core/matrix/dense.hpp | 1 - include/ginkgo/core/matrix/hybrid.hpp | 1 - include/ginkgo/core/matrix/permutation.hpp | 1 - include/ginkgo/core/matrix/row_gatherer.hpp | 1 - include/ginkgo/core/matrix/scaled_permutation.hpp | 1 - include/ginkgo/core/matrix/sparsity_csr.hpp | 1 - include/ginkgo/core/multigrid/fixed_coarsening.hpp | 1 - include/ginkgo/core/multigrid/multigrid_level.hpp | 1 - include/ginkgo/core/multigrid/pgm.hpp | 1 - include/ginkgo/core/preconditioner/ic.hpp | 1 - include/ginkgo/core/preconditioner/ilu.hpp | 1 - include/ginkgo/core/preconditioner/isai.hpp | 1 - include/ginkgo/core/preconditioner/utils.hpp | 1 - include/ginkgo/core/reorder/amd.hpp | 1 - include/ginkgo/core/reorder/mc64.hpp | 1 - include/ginkgo/core/reorder/nested_dissection.hpp | 1 - include/ginkgo/core/reorder/rcm.hpp | 1 - include/ginkgo/core/reorder/reordering_base.hpp | 1 - include/ginkgo/core/solver/batch_bicgstab.hpp | 1 - include/ginkgo/core/solver/batch_cg.hpp | 1 - include/ginkgo/core/solver/bicg.hpp | 1 - include/ginkgo/core/solver/bicgstab.hpp | 1 - include/ginkgo/core/solver/cb_gmres.hpp | 1 - include/ginkgo/core/solver/cg.hpp | 1 - include/ginkgo/core/solver/cgs.hpp | 1 - include/ginkgo/core/solver/fcg.hpp | 1 - include/ginkgo/core/solver/gcr.hpp | 1 - include/ginkgo/core/solver/gmres.hpp | 1 - include/ginkgo/core/solver/idr.hpp | 1 - include/ginkgo/core/solver/ir.hpp | 1 - include/ginkgo/core/solver/multigrid.hpp | 1 - include/ginkgo/core/solver/solver_base.hpp | 1 - include/ginkgo/core/solver/triangular.hpp | 1 - 
include/ginkgo/core/solver/workspace.hpp | 1 - include/ginkgo/core/stop/combined.hpp | 1 - include/ginkgo/core/stop/residual_norm.hpp | 1 - include/ginkgo/core/stop/time.hpp | 1 - include/ginkgo/extensions/config/json_config.hpp | 2 -- include/ginkgo/extensions/kokkos/spaces.hpp | 1 - include/ginkgo/extensions/kokkos/types.hpp | 3 --- omp/base/batch_multi_vector_kernels.cpp | 3 --- omp/base/device_matrix_data_kernels.cpp | 3 --- omp/base/executor.cpp | 1 - omp/base/index_set_kernels.cpp | 3 --- omp/base/kernel_launch.hpp | 1 - omp/base/kernel_launch_reduction.hpp | 1 - omp/base/scoped_device_id.cpp | 1 - omp/components/atomic.hpp | 1 - omp/components/csr_spgeam.hpp | 2 -- omp/components/matrix_operations.hpp | 1 - omp/components/prefix_sum_kernels.cpp | 3 --- omp/components/sort_small.hpp | 1 - omp/distributed/index_map_kernels.cpp | 3 --- omp/distributed/matrix_kernels.cpp | 3 --- omp/distributed/partition_helpers_kernels.cpp | 1 - omp/distributed/partition_kernels.cpp | 3 --- omp/distributed/vector_kernels.cpp | 1 - omp/factorization/cholesky_kernels.cpp | 3 --- omp/factorization/factorization_kernels.cpp | 3 --- omp/factorization/lu_kernels.cpp | 3 --- omp/factorization/par_ic_kernels.cpp | 2 -- omp/factorization/par_ict_kernels.cpp | 3 --- omp/factorization/par_ilu_kernels.cpp | 2 -- omp/factorization/par_ilut_kernels.cpp | 4 ---- omp/matrix/batch_csr_kernels.cpp | 3 --- omp/matrix/batch_dense_kernels.cpp | 3 --- omp/matrix/batch_ell_kernels.cpp | 3 --- omp/matrix/coo_kernels.cpp | 4 ---- omp/matrix/csr_kernels.cpp | 4 ---- omp/matrix/dense_kernels.cpp | 4 ---- omp/matrix/diagonal_kernels.cpp | 2 -- omp/matrix/ell_kernels.cpp | 4 ---- omp/matrix/fbcsr_kernels.cpp | 4 ---- omp/matrix/fft_kernels.cpp | 2 -- omp/matrix/sellp_kernels.cpp | 3 --- omp/matrix/sparsity_csr_kernels.cpp | 4 ---- omp/multigrid/pgm_kernels.cpp | 4 ---- omp/preconditioner/batch_jacobi_kernels.cpp | 1 - omp/preconditioner/isai_kernels.cpp | 4 ---- omp/preconditioner/jacobi_kernels.cpp | 4 
---- omp/reorder/rcm_kernels.cpp | 4 ---- omp/solver/batch_bicgstab_kernels.cpp | 3 --- omp/solver/batch_cg_kernels.cpp | 3 --- omp/solver/cb_gmres_kernels.cpp | 3 --- omp/solver/idr_kernels.cpp | 4 ---- omp/solver/lower_trs_kernels.cpp | 3 --- omp/solver/multigrid_kernels.cpp | 1 - omp/solver/upper_trs_kernels.cpp | 3 --- omp/stop/criterion_kernels.cpp | 1 - omp/stop/residual_norm_kernels.cpp | 2 -- omp/test/base/index_set.cpp | 6 +----- omp/test/base/kernel_launch.cpp | 4 ---- omp/test/matrix/fbcsr_kernels.cpp | 8 ++------ reference/base/batch_multi_vector_kernels.cpp | 3 --- reference/base/batch_struct.hpp | 1 - reference/base/device_matrix_data_kernels.cpp | 3 --- reference/base/index_set_kernels.cpp | 3 --- reference/base/scoped_device_id.cpp | 1 - reference/components/convert_ptrs.hpp | 1 - reference/components/csr_spgeam.hpp | 2 -- reference/components/fill_array_kernels.cpp | 1 - reference/components/format_conversion_kernels.cpp | 2 -- reference/components/precision_conversion_kernels.cpp | 1 - reference/components/reduce_array_kernels.cpp | 1 - reference/distributed/index_map_kernels.cpp | 2 -- reference/distributed/matrix_kernels.cpp | 1 - reference/distributed/partition_helpers.hpp | 1 - reference/distributed/partition_helpers_kernels.cpp | 1 - reference/distributed/vector_kernels.cpp | 1 - reference/factorization/cholesky_kernels.cpp | 3 --- reference/factorization/factorization_kernels.cpp | 3 --- reference/factorization/ic_kernels.cpp | 2 -- reference/factorization/ilu_kernels.cpp | 3 --- reference/factorization/lu_kernels.cpp | 3 --- reference/factorization/par_ic_kernels.cpp | 2 -- reference/factorization/par_ict_kernels.cpp | 3 --- reference/factorization/par_ilu_kernels.cpp | 2 -- reference/factorization/par_ilut_kernels.cpp | 3 --- reference/matrix/batch_csr_kernels.cpp | 3 --- reference/matrix/batch_dense_kernels.cpp | 3 --- reference/matrix/batch_ell_kernels.cpp | 3 --- reference/matrix/batch_struct.hpp | 5 +---- 
reference/matrix/coo_kernels.cpp | 2 -- reference/matrix/csr_kernels.cpp | 3 --- reference/matrix/dense_kernels.cpp | 3 --- reference/matrix/diagonal_kernels.cpp | 1 - reference/matrix/ell_kernels.cpp | 2 -- reference/matrix/fbcsr_kernels.cpp | 3 --- reference/matrix/fft_kernels.cpp | 2 -- reference/matrix/hybrid_kernels.cpp | 2 -- reference/matrix/scaled_permutation_kernels.cpp | 1 - reference/matrix/sellp_kernels.cpp | 2 -- reference/matrix/sparsity_csr_kernels.cpp | 3 --- reference/multigrid/pgm_kernels.cpp | 3 --- reference/preconditioner/batch_block_jacobi.hpp | 1 - reference/preconditioner/batch_jacobi_kernels.cpp | 1 - reference/preconditioner/isai_kernels.cpp | 3 --- reference/preconditioner/jacobi_kernels.cpp | 3 --- reference/reorder/rcm_kernels.cpp | 3 --- reference/solver/batch_bicgstab_kernels.cpp | 1 - reference/solver/batch_cg_kernels.cpp | 1 - reference/solver/bicg_kernels.cpp | 1 - reference/solver/bicgstab_kernels.cpp | 2 -- reference/solver/cb_gmres_kernels.cpp | 3 --- reference/solver/cg_kernels.cpp | 1 - reference/solver/cgs_kernels.cpp | 1 - reference/solver/common_gmres_kernels.cpp | 2 -- reference/solver/fcg_kernels.cpp | 1 - reference/solver/gcr_kernels.cpp | 1 - reference/solver/gmres_kernels.cpp | 1 - reference/solver/idr_kernels.cpp | 2 -- reference/solver/lower_trs_kernels.cpp | 2 -- reference/solver/multigrid_kernels.cpp | 1 - reference/solver/upper_trs_kernels.cpp | 2 -- reference/stop/criterion_kernels.cpp | 1 - reference/stop/residual_norm_kernels.cpp | 2 -- reference/test/base/array.cpp | 7 +------ reference/test/base/batch_multi_vector_kernels.cpp | 8 ++------ reference/test/base/combination.cpp | 7 +------ reference/test/base/composition.cpp | 7 +------ reference/test/base/index_set.cpp | 7 +------ reference/test/base/perturbation.cpp | 7 +------ reference/test/base/utils.cpp | 4 ---- reference/test/components/absolute_array_kernels.cpp | 4 ---- reference/test/components/fill_array_kernels.cpp | 4 ---- 
reference/test/components/format_conversion_kernels.cpp | 3 --- .../test/components/precision_conversion_kernels.cpp | 3 --- reference/test/components/prefix_sum_kernels.cpp | 4 ---- reference/test/components/reduce_array_kernels.cpp | 4 ---- reference/test/distributed/index_map_kernels.cpp | 8 ++------ reference/test/distributed/matrix_kernels.cpp | 6 ++---- reference/test/distributed/partition_helpers_kernels.cpp | 6 ++---- reference/test/distributed/partition_kernels.cpp | 8 ++------ reference/test/distributed/vector_kernels.cpp | 6 ++---- reference/test/factorization/cholesky_kernels.cpp | 8 ++------ reference/test/factorization/factorization.cpp | 7 +------ reference/test/factorization/ic_kernels.cpp | 7 +------ reference/test/factorization/ilu_kernels.cpp | 7 +------ reference/test/factorization/lu_kernels.cpp | 8 ++------ reference/test/factorization/par_ic_kernels.cpp | 8 ++------ reference/test/factorization/par_ict_kernels.cpp | 8 ++------ reference/test/factorization/par_ilu_kernels.cpp | 8 ++------ reference/test/factorization/par_ilut_kernels.cpp | 8 ++------ reference/test/log/convergence.cpp | 6 +----- reference/test/log/papi.cpp | 6 +----- reference/test/matrix/batch_csr_kernels.cpp | 8 ++------ reference/test/matrix/batch_dense_kernels.cpp | 8 ++------ reference/test/matrix/batch_ell_kernels.cpp | 8 ++------ reference/test/matrix/coo_kernels.cpp | 8 ++------ reference/test/matrix/csr_kernels.cpp | 8 ++------ reference/test/matrix/dense_kernels.cpp | 8 ++------ reference/test/matrix/diagonal_kernels.cpp | 8 ++------ reference/test/matrix/ell_kernels.cpp | 7 +------ reference/test/matrix/fbcsr_kernels.cpp | 8 ++------ reference/test/matrix/fft_kernels.cpp | 7 +------ reference/test/matrix/hybrid_kernels.cpp | 8 ++------ reference/test/matrix/identity.cpp | 6 +----- reference/test/matrix/permutation.cpp | 7 +------ reference/test/matrix/scaled_permutation.cpp | 7 +------ reference/test/matrix/sellp_kernels.cpp | 7 ++----- 
reference/test/matrix/sparsity_csr.cpp | 6 +----- reference/test/matrix/sparsity_csr_kernels.cpp | 8 ++------ reference/test/multigrid/fixed_coarsening_kernels.cpp | 7 +------ reference/test/multigrid/pgm_kernels.cpp | 8 ++------ reference/test/preconditioner/batch_jacobi_kernels.cpp | 8 ++------ reference/test/preconditioner/ic.cpp | 7 +------ reference/test/preconditioner/ilu.cpp | 7 +------ reference/test/preconditioner/isai_kernels.cpp | 8 ++------ reference/test/preconditioner/jacobi.cpp | 7 +------ reference/test/preconditioner/jacobi_kernels.cpp | 7 +------ reference/test/reorder/mc64.cpp | 7 +------ reference/test/reorder/mc64_kernels.cpp | 8 ++------ reference/test/reorder/nested_dissection.cpp | 7 ++----- reference/test/reorder/rcm.cpp | 7 +------ reference/test/reorder/rcm_kernels.cpp | 7 +------ reference/test/reorder/scaled_reordered.cpp | 7 +------ reference/test/solver/batch_bicgstab_kernels.cpp | 8 ++------ reference/test/solver/batch_cg_kernels.cpp | 8 ++------ reference/test/solver/bicg_kernels.cpp | 7 ++----- reference/test/solver/bicgstab_kernels.cpp | 7 ++----- reference/test/solver/cb_gmres_kernels.cpp | 7 +------ reference/test/solver/cg_kernels.cpp | 7 ++----- reference/test/solver/cgs_kernels.cpp | 7 ++----- reference/test/solver/direct.cpp | 7 +------ reference/test/solver/fcg_kernels.cpp | 7 ++----- reference/test/solver/gcr_kernels.cpp | 8 ++------ reference/test/solver/gmres_kernels.cpp | 8 ++------ reference/test/solver/idr_kernels.cpp | 6 +----- reference/test/solver/ir_kernels.cpp | 7 ++----- reference/test/solver/lower_trs.cpp | 3 --- reference/test/solver/lower_trs_kernels.cpp | 6 ++---- reference/test/solver/multigrid_kernels.cpp | 6 +----- reference/test/solver/upper_trs.cpp | 3 --- reference/test/solver/upper_trs_kernels.cpp | 6 ++---- reference/test/stop/combined.cpp | 6 ++---- reference/test/stop/criterion_kernels.cpp | 5 +---- reference/test/stop/iteration.cpp | 5 ++--- reference/test/stop/residual_norm_kernels.cpp | 7 
+------ reference/test/stop/time.cpp | 5 ++--- reference/test/utils/assertions_test.cpp | 3 --- test/base/batch_multi_vector_kernels.cpp | 8 ++------ test/base/device_matrix_data_kernels.cpp | 8 ++------ test/base/executor.cpp | 6 ++---- test/base/index_range.cpp | 6 ++---- test/base/kernel_launch_generic.cpp | 7 +------ test/base/timer.cpp | 5 +---- test/components/absolute_array_kernels.cpp | 4 ---- test/components/fill_array_kernels.cpp | 4 ---- test/components/format_conversion_kernels.cpp | 3 --- test/components/precision_conversion_kernels.cpp | 3 --- test/components/prefix_sum_kernels.cpp | 4 ---- test/components/reduce_array_kernels.cpp | 4 ---- test/distributed/index_map_kernels.cpp | 4 ---- test/distributed/matrix_kernels.cpp | 4 ---- test/distributed/partition_helper_kernels.cpp | 2 -- test/distributed/partition_kernels.cpp | 4 ---- test/distributed/vector_kernels.cpp | 4 ---- test/factorization/cholesky_kernels.cpp | 4 ---- test/factorization/ic_kernels.cpp | 3 --- test/factorization/ilu_kernels.cpp | 3 --- test/factorization/lu_kernels.cpp | 4 ---- test/factorization/par_ic_kernels.cpp | 4 ---- test/factorization/par_ict_kernels.cpp | 4 ---- test/factorization/par_ilu_kernels.cpp | 4 ---- test/factorization/par_ilut_kernels.cpp | 4 ---- test/log/profiler_hook.cpp | 3 --- test/matrix/batch_csr_kernels.cpp | 4 ---- test/matrix/batch_dense_kernels.cpp | 4 ---- test/matrix/batch_ell_kernels.cpp | 4 ---- test/matrix/coo_kernels.cpp | 4 ---- test/matrix/csr_kernels.cpp | 4 ---- test/matrix/csr_kernels2.cpp | 7 +------ test/matrix/dense_kernels.cpp | 4 ---- test/matrix/diagonal_kernels.cpp | 4 ---- test/matrix/ell_kernels.cpp | 4 ---- test/matrix/fbcsr_kernels.cpp | 4 ---- test/matrix/fft_kernels.cpp | 3 --- test/matrix/hybrid_kernels.cpp | 4 ---- test/matrix/matrix.cpp | 3 --- test/matrix/permutation_kernels.cpp | 3 --- test/matrix/scaled_permutation_kernels.cpp | 3 --- test/matrix/sellp_kernels.cpp | 4 ---- test/matrix/sparsity_csr_kernels.cpp | 4 ---- 
test/mpi/matrix.cpp | 4 ---- test/mpi/multigrid/pgm.cpp | 4 ---- test/mpi/partition_helpers.cpp | 1 - test/mpi/preconditioner/schwarz.cpp | 4 ---- test/mpi/solver/solver.cpp | 3 --- test/mpi/vector.cpp | 4 ---- test/multigrid/fixed_coarsening_kernels.cpp | 3 --- test/multigrid/pgm_kernels.cpp | 4 ---- test/preconditioner/batch_jacobi_kernels.cpp | 4 ---- test/preconditioner/isai_kernels.cpp | 4 ---- test/preconditioner/jacobi_kernels.cpp | 3 --- test/reorder/amd.cpp | 3 --- test/reorder/mc64.cpp | 2 -- test/reorder/nested_dissection.cpp | 3 --- test/reorder/rcm.cpp | 3 --- test/solver/batch_bicgstab_kernels.cpp | 4 ---- test/solver/batch_cg_kernels.cpp | 4 ---- test/solver/bicg_kernels.cpp | 4 ---- test/solver/bicgstab_kernels.cpp | 4 ---- test/solver/cb_gmres_kernels.cpp | 4 ---- test/solver/cg_kernels.cpp | 4 ---- test/solver/cgs_kernels.cpp | 4 ---- test/solver/direct.cpp | 3 --- test/solver/fcg_kernels.cpp | 4 ---- test/solver/gcr_kernels.cpp | 4 ---- test/solver/gmres_kernels.cpp | 4 ---- test/solver/idr_kernels.cpp | 3 --- test/solver/ir_kernels.cpp | 4 ---- test/solver/lower_trs_kernels.cpp | 3 --- test/solver/multigrid_kernels.cpp | 4 ---- test/solver/solver.cpp | 3 --- test/solver/upper_trs_kernels.cpp | 3 --- test/stop/combined_kernels.cpp | 2 -- test/stop/criterion_kernels.cpp | 2 -- test/stop/residual_norm_kernels.cpp | 2 -- test/test_install/test_install.cpp | 5 ++--- test/tools/resource_file_generator.cpp | 1 - test/utils/executor.hpp | 7 +------ test/utils/mpi/executor.hpp | 7 +------ 1132 files changed, 495 insertions(+), 3259 deletions(-) diff --git a/accessor/accessor_helper.hpp b/accessor/accessor_helper.hpp index 2e2eb20085c..d7c8f1513d9 100644 --- a/accessor/accessor_helper.hpp +++ b/accessor/accessor_helper.hpp @@ -13,7 +13,6 @@ #include #include - #include "index_span.hpp" #include "utils.hpp" diff --git a/accessor/block_col_major.hpp b/accessor/block_col_major.hpp index edd8ee15a61..6ffa7ea789b 100644 --- a/accessor/block_col_major.hpp +++ 
b/accessor/block_col_major.hpp @@ -8,7 +8,6 @@ #include #include - #include "accessor_helper.hpp" #include "range.hpp" #include "utils.hpp" diff --git a/accessor/cuda_helper.hpp b/accessor/cuda_helper.hpp index 8beb2e8a8df..31d3599516d 100644 --- a/accessor/cuda_helper.hpp +++ b/accessor/cuda_helper.hpp @@ -8,10 +8,8 @@ #include - #include - #include "block_col_major.hpp" #include "reduced_row_major.hpp" #include "row_major.hpp" diff --git a/accessor/hip_helper.hpp b/accessor/hip_helper.hpp index ed1eda37775..6b76b726c10 100644 --- a/accessor/hip_helper.hpp +++ b/accessor/hip_helper.hpp @@ -8,10 +8,8 @@ #include - #include - #include "block_col_major.hpp" #include "reduced_row_major.hpp" #include "row_major.hpp" diff --git a/accessor/math.hpp b/accessor/math.hpp index c2a64f66bc3..0e6cebbb992 100644 --- a/accessor/math.hpp +++ b/accessor/math.hpp @@ -7,7 +7,6 @@ #include - #include "utils.hpp" diff --git a/accessor/range.hpp b/accessor/range.hpp index 7667359f88b..e3e260c8781 100644 --- a/accessor/range.hpp +++ b/accessor/range.hpp @@ -8,7 +8,6 @@ #include - #include "utils.hpp" diff --git a/accessor/reduced_row_major.hpp b/accessor/reduced_row_major.hpp index dd5981fac6a..a9ed30f7b2f 100644 --- a/accessor/reduced_row_major.hpp +++ b/accessor/reduced_row_major.hpp @@ -12,7 +12,6 @@ #include #include - #include "accessor_helper.hpp" #include "index_span.hpp" #include "range.hpp" diff --git a/accessor/reduced_row_major_reference.hpp b/accessor/reduced_row_major_reference.hpp index 5d75146b457..34bb4c14b14 100644 --- a/accessor/reduced_row_major_reference.hpp +++ b/accessor/reduced_row_major_reference.hpp @@ -9,7 +9,6 @@ #include #include - #include "math.hpp" #include "reference_helper.hpp" #include "utils.hpp" diff --git a/accessor/reference_helper.hpp b/accessor/reference_helper.hpp index 2fce0630ba4..a3a77352f8f 100644 --- a/accessor/reference_helper.hpp +++ b/accessor/reference_helper.hpp @@ -9,7 +9,6 @@ #include #include - #include "utils.hpp" diff --git 
a/accessor/row_major.hpp b/accessor/row_major.hpp index 500138a8ad6..c18f73524bd 100644 --- a/accessor/row_major.hpp +++ b/accessor/row_major.hpp @@ -8,7 +8,6 @@ #include #include - #include "accessor_helper.hpp" #include "range.hpp" #include "utils.hpp" diff --git a/accessor/scaled_reduced_row_major.hpp b/accessor/scaled_reduced_row_major.hpp index f7873640262..9d9f986b0fe 100644 --- a/accessor/scaled_reduced_row_major.hpp +++ b/accessor/scaled_reduced_row_major.hpp @@ -11,7 +11,6 @@ #include #include - #include "accessor_helper.hpp" #include "index_span.hpp" #include "range.hpp" diff --git a/accessor/scaled_reduced_row_major_reference.hpp b/accessor/scaled_reduced_row_major_reference.hpp index 6c18ea970e0..861dbd9a9bf 100644 --- a/accessor/scaled_reduced_row_major_reference.hpp +++ b/accessor/scaled_reduced_row_major_reference.hpp @@ -8,7 +8,6 @@ #include - #include "math.hpp" #include "reference_helper.hpp" #include "utils.hpp" diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp index 0f4dc57e84e..57e0152d824 100644 --- a/benchmark/blas/blas.cpp +++ b/benchmark/blas/blas.cpp @@ -2,13 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include +#include #include "benchmark/blas/blas_common.hpp" #include "benchmark/utils/general.hpp" diff --git a/benchmark/blas/blas_common.hpp b/benchmark/blas/blas_common.hpp index 3a9e123f2e9..c930c8ba5ef 100644 --- a/benchmark/blas/blas_common.hpp +++ b/benchmark/blas/blas_common.hpp @@ -2,9 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include @@ -13,6 +10,7 @@ #include #include +#include #include "benchmark/utils/general.hpp" #include "benchmark/utils/iteration_control.hpp" diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp index 054faf50bdc..a4be6c502c1 100644 --- a/benchmark/blas/distributed/multi_vector.cpp +++ b/benchmark/blas/distributed/multi_vector.cpp @@ -2,13 +2,12 @@ // // 
SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include +#include + #define GKO_BENCHMARK_DISTRIBUTED diff --git a/benchmark/conversion/conversion.cpp b/benchmark/conversion/conversion.cpp index 5f3212e27ab..17d2ac48e47 100644 --- a/benchmark/conversion/conversion.cpp +++ b/benchmark/conversion/conversion.cpp @@ -2,9 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include @@ -14,6 +11,7 @@ #include #include +#include #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" diff --git a/benchmark/matrix_generator/matrix_generator.cpp b/benchmark/matrix_generator/matrix_generator.cpp index b77abc8f1e5..dc1cb3cc08a 100644 --- a/benchmark/matrix_generator/matrix_generator.cpp +++ b/benchmark/matrix_generator/matrix_generator.cpp @@ -2,14 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include +#include #include "benchmark/utils/general.hpp" #include "benchmark/utils/types.hpp" diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp index 7228e56d9b4..8eb847f42f2 100644 --- a/benchmark/matrix_statistics/matrix_statistics.cpp +++ b/benchmark/matrix_statistics/matrix_statistics.cpp @@ -2,17 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include +#include #include - #include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" #include "benchmark/utils/runner.hpp" diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp index 5c7f95bfb6b..3c737d67d7b 100644 --- a/benchmark/preconditioner/preconditioner.cpp +++ b/benchmark/preconditioner/preconditioner.cpp @@ -2,15 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include #include +#include #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" diff --git 
a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp index 18feedc3faa..196bae5331b 100644 --- a/benchmark/solver/distributed/solver.cpp +++ b/benchmark/solver/distributed/solver.cpp @@ -2,14 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include +#include + #define GKO_BENCHMARK_DISTRIBUTED diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp index d1049b538c7..94956cadd21 100644 --- a/benchmark/solver/solver.cpp +++ b/benchmark/solver/solver.cpp @@ -2,9 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include @@ -17,6 +14,7 @@ #include #include +#include #include "benchmark/solver/solver_common.hpp" #include "benchmark/utils/general_matrix.hpp" diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp index f5267359068..30f3b5a80fe 100644 --- a/benchmark/sparse_blas/operations.cpp +++ b/benchmark/sparse_blas/operations.cpp @@ -2,14 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "benchmark/sparse_blas/operations.hpp" + #include #include - #include - -#include "benchmark/sparse_blas/operations.hpp" #include "core/base/array_access.hpp" #include "core/factorization/elimination_forest.hpp" #include "core/factorization/symbolic.hpp" diff --git a/benchmark/sparse_blas/operations.hpp b/benchmark/sparse_blas/operations.hpp index 74e217b3605..900ae8037fb 100644 --- a/benchmark/sparse_blas/operations.hpp +++ b/benchmark/sparse_blas/operations.hpp @@ -2,11 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include #include "benchmark/utils/json.hpp" #include "benchmark/utils/types.hpp" diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp index becd1643f44..3897689ca11 100644 --- a/benchmark/sparse_blas/sparse_blas.cpp +++ b/benchmark/sparse_blas/sparse_blas.cpp @@ -2,9 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - 
#include #include #include @@ -15,6 +12,7 @@ #include #include +#include #include "benchmark/sparse_blas/operations.hpp" #include "benchmark/utils/general_matrix.hpp" diff --git a/benchmark/spmv/distributed/spmv.cpp b/benchmark/spmv/distributed/spmv.cpp index 5d12d23857a..2c2e0f57b0e 100644 --- a/benchmark/spmv/distributed/spmv.cpp +++ b/benchmark/spmv/distributed/spmv.cpp @@ -2,9 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include @@ -14,6 +11,8 @@ #include #include +#include + #define GKO_BENCHMARK_DISTRIBUTED diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp index 5eef78546e9..960921257e3 100644 --- a/benchmark/spmv/spmv.cpp +++ b/benchmark/spmv/spmv.cpp @@ -2,12 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include +#include #include "benchmark/spmv/spmv_common.hpp" #include "benchmark/utils/formats.hpp" diff --git a/benchmark/tools/matrix.cpp b/benchmark/tools/matrix.cpp index c57602baa8a..8bde597797e 100644 --- a/benchmark/tools/matrix.cpp +++ b/benchmark/tools/matrix.cpp @@ -6,11 +6,9 @@ #include #include - #include #include - #include "core/utils/matrix_utils.hpp" diff --git a/benchmark/tools/mtx_to_binary.cpp b/benchmark/tools/mtx_to_binary.cpp index c9d61050a7c..f5ea82804c2 100644 --- a/benchmark/tools/mtx_to_binary.cpp +++ b/benchmark/tools/mtx_to_binary.cpp @@ -6,7 +6,6 @@ #include #include - #include #include diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp index a404f9151ea..4683d6086e1 100644 --- a/benchmark/utils/cuda_linops.cpp +++ b/benchmark/utils/cuda_linops.cpp @@ -2,16 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include #include +#include #include "benchmark/utils/sparselib_linops.hpp" #include "benchmark/utils/types.hpp" diff --git a/benchmark/utils/cuda_timer.cpp b/benchmark/utils/cuda_timer.cpp index 9fb5c3889fe..02e4d0016b8 100644 --- a/benchmark/utils/cuda_timer.cpp +++ 
b/benchmark/utils/cuda_timer.cpp @@ -5,7 +5,6 @@ #include #include - #include "benchmark/utils/timer_impl.hpp" diff --git a/benchmark/utils/dpcpp_linops.dp.cpp b/benchmark/utils/dpcpp_linops.dp.cpp index 7722a20dcf1..f91de85db2a 100644 --- a/benchmark/utils/dpcpp_linops.dp.cpp +++ b/benchmark/utils/dpcpp_linops.dp.cpp @@ -2,14 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include +#include #include "benchmark/utils/sparselib_linops.hpp" #include "benchmark/utils/types.hpp" diff --git a/benchmark/utils/dpcpp_timer.dp.cpp b/benchmark/utils/dpcpp_timer.dp.cpp index bd97593ccb8..c986f2d8fa0 100644 --- a/benchmark/utils/dpcpp_timer.dp.cpp +++ b/benchmark/utils/dpcpp_timer.dp.cpp @@ -4,10 +4,8 @@ #include - #include - #include "benchmark/utils/timer_impl.hpp" diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index cc609c6a087..13f2cee1056 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -6,16 +6,13 @@ #define GKO_BENCHMARK_UTILS_FORMATS_HPP_ -#include - - #include #include #include - #include +#include #include "benchmark/utils/sparselib_linops.hpp" #include "benchmark/utils/types.hpp" diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index e6137ca6f28..5ae34fa00ab 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -6,9 +6,6 @@ #define GKO_BENCHMARK_UTILS_GENERAL_HPP_ -#include - - #include #include #include @@ -24,13 +21,12 @@ #include #include - #include +#include #include - #include "benchmark/utils/json.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" diff --git a/benchmark/utils/general_matrix.hpp b/benchmark/utils/general_matrix.hpp index a1f448e8bab..043a09d9994 100644 --- a/benchmark/utils/general_matrix.hpp +++ b/benchmark/utils/general_matrix.hpp @@ -6,11 +6,9 @@ #define GKO_BENCHMARK_UTILS_GENERAL_MATRIX_HPP_ -#include - - #include +#include #include "benchmark/utils/general.hpp" #include 
"benchmark/utils/generator.hpp" diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp index 6b173651aa3..9ec22a33d1b 100644 --- a/benchmark/utils/generator.hpp +++ b/benchmark/utils/generator.hpp @@ -8,7 +8,6 @@ #include - #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" #include "benchmark/utils/loggers.hpp" diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp index f0d7edb45c3..b507a0c441b 100644 --- a/benchmark/utils/hip_linops.hip.cpp +++ b/benchmark/utils/hip_linops.hip.cpp @@ -2,11 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include #include "benchmark/utils/sparselib_linops.hpp" #include "benchmark/utils/types.hpp" diff --git a/benchmark/utils/hip_timer.hip.cpp b/benchmark/utils/hip_timer.hip.cpp index 6f114e1d66b..dfc5e8e2b25 100644 --- a/benchmark/utils/hip_timer.hip.cpp +++ b/benchmark/utils/hip_timer.hip.cpp @@ -4,7 +4,6 @@ #include - #include "benchmark/utils/timer_impl.hpp" diff --git a/benchmark/utils/iteration_control.hpp b/benchmark/utils/iteration_control.hpp index ff379ad1dd0..f70d0c88719 100644 --- a/benchmark/utils/iteration_control.hpp +++ b/benchmark/utils/iteration_control.hpp @@ -6,13 +6,11 @@ #define GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_ -#include - - #include #include #include +#include #include "benchmark/utils/general.hpp" #include "benchmark/utils/timer.hpp" diff --git a/benchmark/utils/loggers.hpp b/benchmark/utils/loggers.hpp index 917e7dd5f3d..65d086beecb 100644 --- a/benchmark/utils/loggers.hpp +++ b/benchmark/utils/loggers.hpp @@ -6,15 +6,13 @@ #define GKO_BENCHMARK_UTILS_LOGGERS_HPP_ -#include - - #include #include #include #include #include +#include #include "benchmark/utils/general.hpp" #include "core/distributed/helpers.hpp" diff --git a/benchmark/utils/mpi_timer.cpp b/benchmark/utils/mpi_timer.cpp index 0c4e3cff35b..6ff8510f900 100644 --- a/benchmark/utils/mpi_timer.cpp +++ b/benchmark/utils/mpi_timer.cpp @@ 
-4,7 +4,6 @@ #include - #include "benchmark/utils/timer_impl.hpp" diff --git a/benchmark/utils/overhead_linop.hpp b/benchmark/utils/overhead_linop.hpp index 9a54ae8cacf..02b52d9e070 100644 --- a/benchmark/utils/overhead_linop.hpp +++ b/benchmark/utils/overhead_linop.hpp @@ -10,12 +10,10 @@ #include #include - #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/benchmark/utils/preconditioners.hpp b/benchmark/utils/preconditioners.hpp index 26dd257bd04..63fd22708e6 100644 --- a/benchmark/utils/preconditioners.hpp +++ b/benchmark/utils/preconditioners.hpp @@ -6,15 +6,12 @@ #define GKO_BENCHMARK_UTILS_PRECONDITIONERS_HPP_ -#include - - #include #include - #include +#include #include "benchmark/utils/general.hpp" #include "benchmark/utils/overhead_linop.hpp" diff --git a/benchmark/utils/runner.hpp b/benchmark/utils/runner.hpp index a306cd9ac29..2fd1be7874d 100644 --- a/benchmark/utils/runner.hpp +++ b/benchmark/utils/runner.hpp @@ -6,13 +6,11 @@ #define GKO_BENCHMARK_UTILS_RUNNER_HPP_ -#include - - #include #include #include +#include #include "benchmark/utils/general.hpp" diff --git a/benchmark/utils/sparselib_linops.hpp b/benchmark/utils/sparselib_linops.hpp index 80f2115713f..3bdb909b03d 100644 --- a/benchmark/utils/sparselib_linops.hpp +++ b/benchmark/utils/sparselib_linops.hpp @@ -8,7 +8,6 @@ #include - #include #include diff --git a/benchmark/utils/timer.hpp b/benchmark/utils/timer.hpp index 4ad9bbd12a9..27004202107 100644 --- a/benchmark/utils/timer.hpp +++ b/benchmark/utils/timer.hpp @@ -6,14 +6,11 @@ #define GKO_BENCHMARK_UTILS_TIMER_HPP_ -#include - - #include - #include +#include #include "benchmark/utils/timer_impl.hpp" diff --git a/benchmark/utils/timer_impl.hpp b/benchmark/utils/timer_impl.hpp index 1f5fe426df2..4f15c600b12 100644 --- a/benchmark/utils/timer_impl.hpp +++ b/benchmark/utils/timer_impl.hpp @@ -6,12 +6,11 @@ #define GKO_BENCHMARK_UTILS_TIMER_IMPL_HPP_ -#include - - #include #include +#include + class 
MpiWrappedTimer; diff --git a/benchmark/utils/tuning_variables.cpp b/benchmark/utils/tuning_variables.cpp index facf6d07b9a..1fba6a52924 100644 --- a/benchmark/utils/tuning_variables.cpp +++ b/benchmark/utils/tuning_variables.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include "benchmark/utils/tuning_variables.hpp" +#include + namespace gko { diff --git a/benchmark/utils/types.hpp b/benchmark/utils/types.hpp index 03a9ab1d70c..de7a8a0e45e 100644 --- a/benchmark/utils/types.hpp +++ b/benchmark/utils/types.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/cmake/openmpi_test.cpp b/cmake/openmpi_test.cpp index bfc93e827d4..aba569577d6 100644 --- a/cmake/openmpi_test.cpp +++ b/cmake/openmpi_test.cpp @@ -4,7 +4,6 @@ #include - #include diff --git a/common/cuda_hip/base/thrust.hpp b/common/cuda_hip/base/thrust.hpp index 02aaebc9f3d..365b308850c 100644 --- a/common/cuda_hip/base/thrust.hpp +++ b/common/cuda_hip/base/thrust.hpp @@ -8,7 +8,6 @@ #include - #include #include diff --git a/common/unified/base/device_matrix_data_kernels.cpp b/common/unified/base/device_matrix_data_kernels.cpp index a3f0162c3c8..d801b47fcd5 100644 --- a/common/unified/base/device_matrix_data_kernels.cpp +++ b/common/unified/base/device_matrix_data_kernels.cpp @@ -4,10 +4,8 @@ #include "core/base/device_matrix_data_kernels.hpp" - #include - #include "common/unified/base/kernel_launch.hpp" #include "core/components/fill_array_kernels.hpp" diff --git a/common/unified/base/index_set_kernels.cpp b/common/unified/base/index_set_kernels.cpp index cb8cc72b345..86aff129f00 100644 --- a/common/unified/base/index_set_kernels.cpp +++ b/common/unified/base/index_set_kernels.cpp @@ -4,10 +4,8 @@ #include "core/base/index_set_kernels.hpp" - #include - #include "common/unified/base/kernel_launch.hpp" diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp index 5ca25ecb1e3..fad327ae3b1 100644 --- a/common/unified/base/kernel_launch.hpp +++ 
b/common/unified/base/kernel_launch.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/common/unified/components/absolute_array_kernels.cpp b/common/unified/components/absolute_array_kernels.cpp index 9ed032f6a6f..c9ab364353c 100644 --- a/common/unified/components/absolute_array_kernels.cpp +++ b/common/unified/components/absolute_array_kernels.cpp @@ -4,7 +4,6 @@ #include "core/components/absolute_array_kernels.hpp" - #include "common/unified/base/kernel_launch.hpp" diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp index 4586083f821..d78a6e9f346 100644 --- a/common/unified/components/fill_array_kernels.cpp +++ b/common/unified/components/fill_array_kernels.cpp @@ -4,7 +4,6 @@ #include "core/components/fill_array_kernels.hpp" - #include "common/unified/base/kernel_launch.hpp" diff --git a/common/unified/components/format_conversion_kernels.cpp b/common/unified/components/format_conversion_kernels.cpp index b2e48e32a6b..0f54cb04879 100644 --- a/common/unified/components/format_conversion_kernels.cpp +++ b/common/unified/components/format_conversion_kernels.cpp @@ -4,10 +4,8 @@ #include "core/components/format_conversion_kernels.hpp" - #include - #include "common/unified/base/kernel_launch.hpp" #include "core/components/fill_array_kernels.hpp" diff --git a/common/unified/components/precision_conversion_kernels.cpp b/common/unified/components/precision_conversion_kernels.cpp index 10f051c3a75..0402d9bef68 100644 --- a/common/unified/components/precision_conversion_kernels.cpp +++ b/common/unified/components/precision_conversion_kernels.cpp @@ -4,7 +4,6 @@ #include "core/components/precision_conversion_kernels.hpp" - #include "common/unified/base/kernel_launch.hpp" diff --git a/common/unified/components/reduce_array_kernels.cpp b/common/unified/components/reduce_array_kernels.cpp index 7bf1974ccbd..bc8da6fa311 100644 --- a/common/unified/components/reduce_array_kernels.cpp +++ 
b/common/unified/components/reduce_array_kernels.cpp @@ -4,10 +4,8 @@ #include "core/components/reduce_array_kernels.hpp" - #include - #include "common/unified/base/kernel_launch.hpp" #include "common/unified/base/kernel_launch_reduction.hpp" diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp index ede7bd8be27..3a53157e721 100644 --- a/common/unified/distributed/partition_helpers_kernels.cpp +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -4,7 +4,6 @@ #include "core/distributed/partition_helpers_kernels.hpp" - #include "common/unified/base/kernel_launch.hpp" #include "common/unified/base/kernel_launch_reduction.hpp" #include "core/base/array_access.hpp" diff --git a/common/unified/distributed/partition_kernels.cpp b/common/unified/distributed/partition_kernels.cpp index b76a4c690e4..8d6f23101ee 100644 --- a/common/unified/distributed/partition_kernels.cpp +++ b/common/unified/distributed/partition_kernels.cpp @@ -4,7 +4,6 @@ #include "core/distributed/partition_kernels.hpp" - #include "common/unified/base/kernel_launch.hpp" #include "common/unified/base/kernel_launch_reduction.hpp" #include "core/base/array_access.hpp" diff --git a/common/unified/matrix/coo_kernels.cpp b/common/unified/matrix/coo_kernels.cpp index 71277937e20..ce13d7500ab 100644 --- a/common/unified/matrix/coo_kernels.cpp +++ b/common/unified/matrix/coo_kernels.cpp @@ -4,10 +4,8 @@ #include "core/matrix/coo_kernels.hpp" - #include - #include "common/unified/base/kernel_launch.hpp" diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index 761aefebb82..5236c1c9da9 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -4,13 +4,10 @@ #include "core/matrix/csr_kernels.hpp" - #include - #include - #include "common/unified/base/kernel_launch.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git 
a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index 05966ede9d3..f5b3cc03059 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -4,11 +4,9 @@ #include "core/matrix/dense_kernels.hpp" - #include #include - #include "common/unified/base/kernel_launch.hpp" #include "common/unified/base/kernel_launch_reduction.hpp" #include "core/base/array_access.hpp" diff --git a/common/unified/matrix/diagonal_kernels.cpp b/common/unified/matrix/diagonal_kernels.cpp index d1b2dcdb086..dae037a5134 100644 --- a/common/unified/matrix/diagonal_kernels.cpp +++ b/common/unified/matrix/diagonal_kernels.cpp @@ -4,10 +4,8 @@ #include "core/matrix/diagonal_kernels.hpp" - #include - #include "common/unified/base/kernel_launch.hpp" diff --git a/common/unified/matrix/ell_kernels.cpp b/common/unified/matrix/ell_kernels.cpp index 64b0d093591..6d23e08b68b 100644 --- a/common/unified/matrix/ell_kernels.cpp +++ b/common/unified/matrix/ell_kernels.cpp @@ -4,10 +4,8 @@ #include "core/matrix/ell_kernels.hpp" - #include - #include "common/unified/base/kernel_launch.hpp" #include "common/unified/base/kernel_launch_reduction.hpp" #include "core/base/array_access.hpp" diff --git a/common/unified/matrix/hybrid_kernels.cpp b/common/unified/matrix/hybrid_kernels.cpp index 25338bd0b12..8a21a2415f7 100644 --- a/common/unified/matrix/hybrid_kernels.cpp +++ b/common/unified/matrix/hybrid_kernels.cpp @@ -4,7 +4,6 @@ #include "core/matrix/hybrid_kernels.hpp" - #include "common/unified/base/kernel_launch.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/common/unified/matrix/permutation_kernels.cpp b/common/unified/matrix/permutation_kernels.cpp index 7a6b882c754..a1ba9ab54ad 100644 --- a/common/unified/matrix/permutation_kernels.cpp +++ b/common/unified/matrix/permutation_kernels.cpp @@ -4,10 +4,8 @@ #include "core/matrix/permutation_kernels.hpp" - #include - #include 
"common/unified/base/kernel_launch.hpp" diff --git a/common/unified/matrix/scaled_permutation_kernels.cpp b/common/unified/matrix/scaled_permutation_kernels.cpp index d658f00ca4b..3eaab65e8e6 100644 --- a/common/unified/matrix/scaled_permutation_kernels.cpp +++ b/common/unified/matrix/scaled_permutation_kernels.cpp @@ -4,10 +4,8 @@ #include "core/matrix/scaled_permutation_kernels.hpp" - #include - #include "common/unified/base/kernel_launch.hpp" diff --git a/common/unified/matrix/sellp_kernels.cpp b/common/unified/matrix/sellp_kernels.cpp index af8e463f78c..93b71ff43f2 100644 --- a/common/unified/matrix/sellp_kernels.cpp +++ b/common/unified/matrix/sellp_kernels.cpp @@ -4,10 +4,8 @@ #include "core/matrix/sellp_kernels.hpp" - #include - #include "common/unified/base/kernel_launch.hpp" #include "common/unified/base/kernel_launch_reduction.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/common/unified/matrix/sparsity_csr_kernels.cpp b/common/unified/matrix/sparsity_csr_kernels.cpp index 8e54a14becb..c5a9c79a89b 100644 --- a/common/unified/matrix/sparsity_csr_kernels.cpp +++ b/common/unified/matrix/sparsity_csr_kernels.cpp @@ -4,10 +4,8 @@ #include "core/matrix/sparsity_csr_kernels.hpp" - #include - #include "common/unified/base/kernel_launch.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp index 3c163996565..9ba144cba2e 100644 --- a/common/unified/multigrid/pgm_kernels.cpp +++ b/common/unified/multigrid/pgm_kernels.cpp @@ -4,10 +4,8 @@ #include "core/multigrid/pgm_kernels.hpp" - #include - #include "common/unified/base/kernel_launch.hpp" #include "common/unified/base/kernel_launch_reduction.hpp" #include "core/base/array_access.hpp" diff --git a/common/unified/preconditioner/jacobi_kernels.cpp b/common/unified/preconditioner/jacobi_kernels.cpp index bbc393ddab2..b8c19c24f79 100644 --- a/common/unified/preconditioner/jacobi_kernels.cpp +++ 
b/common/unified/preconditioner/jacobi_kernels.cpp @@ -4,10 +4,8 @@ #include "core/preconditioner/jacobi_kernels.hpp" - #include - #include "common/unified/base/kernel_launch.hpp" diff --git a/common/unified/solver/bicg_kernels.cpp b/common/unified/solver/bicg_kernels.cpp index 60738c5618c..7d15718c05d 100644 --- a/common/unified/solver/bicg_kernels.cpp +++ b/common/unified/solver/bicg_kernels.cpp @@ -4,10 +4,8 @@ #include "core/solver/bicg_kernels.hpp" - #include - #include "common/unified/base/kernel_launch_solver.hpp" diff --git a/common/unified/solver/bicgstab_kernels.cpp b/common/unified/solver/bicgstab_kernels.cpp index 58a6148c138..b696815f0d4 100644 --- a/common/unified/solver/bicgstab_kernels.cpp +++ b/common/unified/solver/bicgstab_kernels.cpp @@ -4,10 +4,8 @@ #include "core/solver/bicgstab_kernels.hpp" - #include - #include "common/unified/base/kernel_launch_solver.hpp" diff --git a/common/unified/solver/cg_kernels.cpp b/common/unified/solver/cg_kernels.cpp index 37f8c885987..822dddf1c3b 100644 --- a/common/unified/solver/cg_kernels.cpp +++ b/common/unified/solver/cg_kernels.cpp @@ -4,10 +4,8 @@ #include "core/solver/cg_kernels.hpp" - #include - #include "common/unified/base/kernel_launch_solver.hpp" diff --git a/common/unified/solver/cgs_kernels.cpp b/common/unified/solver/cgs_kernels.cpp index a20a3faf3c8..0618b8f8208 100644 --- a/common/unified/solver/cgs_kernels.cpp +++ b/common/unified/solver/cgs_kernels.cpp @@ -4,10 +4,8 @@ #include "core/solver/cgs_kernels.hpp" - #include - #include "common/unified/base/kernel_launch_solver.hpp" diff --git a/common/unified/solver/common_gmres_kernels.cpp b/common/unified/solver/common_gmres_kernels.cpp index 8773cce4e66..0e6ba18bb64 100644 --- a/common/unified/solver/common_gmres_kernels.cpp +++ b/common/unified/solver/common_gmres_kernels.cpp @@ -4,10 +4,8 @@ #include "core/solver/common_gmres_kernels.hpp" - #include - #include "common/unified/base/kernel_launch.hpp" #include "core/solver/cb_gmres_kernels.hpp" 
diff --git a/common/unified/solver/fcg_kernels.cpp b/common/unified/solver/fcg_kernels.cpp index cbe23526c09..7853d97c358 100644 --- a/common/unified/solver/fcg_kernels.cpp +++ b/common/unified/solver/fcg_kernels.cpp @@ -4,10 +4,8 @@ #include "core/solver/fcg_kernels.hpp" - #include - #include "common/unified/base/kernel_launch_solver.hpp" diff --git a/common/unified/solver/gcr_kernels.cpp b/common/unified/solver/gcr_kernels.cpp index 57422ce9954..0c9e825228a 100644 --- a/common/unified/solver/gcr_kernels.cpp +++ b/common/unified/solver/gcr_kernels.cpp @@ -4,10 +4,8 @@ #include "core/solver/gcr_kernels.hpp" - #include - #include "common/unified/base/kernel_launch_solver.hpp" diff --git a/common/unified/solver/gmres_kernels.cpp b/common/unified/solver/gmres_kernels.cpp index 5c0cd52bdcf..3997963f8d7 100644 --- a/common/unified/solver/gmres_kernels.cpp +++ b/common/unified/solver/gmres_kernels.cpp @@ -4,11 +4,9 @@ #include "core/solver/gmres_kernels.hpp" - #include #include - #include "common/unified/base/kernel_launch.hpp" diff --git a/common/unified/solver/ir_kernels.cpp b/common/unified/solver/ir_kernels.cpp index 1e95206557e..96f0731f039 100644 --- a/common/unified/solver/ir_kernels.cpp +++ b/common/unified/solver/ir_kernels.cpp @@ -4,7 +4,6 @@ #include "core/solver/ir_kernels.hpp" - #include "common/unified/base/kernel_launch.hpp" diff --git a/core/base/allocator.hpp b/core/base/allocator.hpp index 0bfbd0158c3..e45c4aef813 100644 --- a/core/base/allocator.hpp +++ b/core/base/allocator.hpp @@ -15,7 +15,6 @@ #include #include - #include diff --git a/core/base/array.cpp b/core/base/array.cpp index f529e3cf9d2..a41f7c07e55 100644 --- a/core/base/array.cpp +++ b/core/base/array.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/base/array.hpp" - #include - #include "core/base/array_access.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/precision_conversion_kernels.hpp" diff --git a/core/base/batch_multi_vector.cpp 
b/core/base/batch_multi_vector.cpp index 960158654f2..f4485377f25 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -4,11 +4,9 @@ #include "ginkgo/core/base/batch_multi_vector.hpp" - #include #include - #include #include #include @@ -18,7 +16,6 @@ #include #include - #include "core/base/batch_multi_vector_kernels.hpp" diff --git a/core/base/batch_multi_vector_kernels.hpp b/core/base/batch_multi_vector_kernels.hpp index 9a3618c06fb..45a9bf7e2d3 100644 --- a/core/base/batch_multi_vector_kernels.hpp +++ b/core/base/batch_multi_vector_kernels.hpp @@ -7,12 +7,9 @@ #include - - #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp index 877ed926101..6964eeee544 100644 --- a/core/base/batch_utilities.hpp +++ b/core/base/batch_utilities.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/core/base/block_operator.cpp b/core/base/block_operator.cpp index 43ac79c3c0e..f53375301a8 100644 --- a/core/base/block_operator.cpp +++ b/core/base/block_operator.cpp @@ -4,14 +4,11 @@ #include "ginkgo/core/base/block_operator.hpp" - #include - #include #include - #include "core/base/dispatch_helper.hpp" diff --git a/core/base/combination.cpp b/core/base/combination.cpp index 324fa8d4ddf..3b30b77d38c 100644 --- a/core/base/combination.cpp +++ b/core/base/combination.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/base/combination.hpp" - #include #include diff --git a/core/base/composition.cpp b/core/base/composition.cpp index 515fb425633..82c8152300b 100644 --- a/core/base/composition.cpp +++ b/core/base/composition.cpp @@ -4,15 +4,12 @@ #include "ginkgo/core/base/composition.hpp" - #include #include - #include #include - #include "core/components/fill_array_kernels.hpp" diff --git a/core/base/dense_cache.cpp b/core/base/dense_cache.cpp index 50e1abc3977..6adbb6107c9 100644 --- a/core/base/dense_cache.cpp +++ b/core/base/dense_cache.cpp @@ -4,7 
+4,6 @@ #include "ginkgo/core/base/dense_cache.hpp" - #include diff --git a/core/base/device_matrix_data.cpp b/core/base/device_matrix_data.cpp index 085054cbd69..a2e5d6e7044 100644 --- a/core/base/device_matrix_data.cpp +++ b/core/base/device_matrix_data.cpp @@ -4,12 +4,10 @@ #include "ginkgo/core/base/device_matrix_data.hpp" - #include #include #include - #include "core/base/device_matrix_data_kernels.hpp" diff --git a/core/base/device_matrix_data_kernels.hpp b/core/base/device_matrix_data_kernels.hpp index 2c7d2a81225..bcaeebdf0cb 100644 --- a/core/base/device_matrix_data_kernels.hpp +++ b/core/base/device_matrix_data_kernels.hpp @@ -6,16 +6,12 @@ #define GKO_CORE_BASE_DEVICE_MATRIX_DATA_KERNELS_HPP_ -#include - - #include - +#include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/base/dispatch_helper.hpp b/core/base/dispatch_helper.hpp index 36c664d80ff..169b907775b 100644 --- a/core/base/dispatch_helper.hpp +++ b/core/base/dispatch_helper.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/core/base/executor.cpp b/core/base/executor.cpp index 1fb1703c56f..65019efe94c 100644 --- a/core/base/executor.cpp +++ b/core/base/executor.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/base/executor.hpp" - #include #include #include diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index e2d4e01a7d4..2dc60afd329 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -9,7 +9,6 @@ #include #include - #include #include diff --git a/core/base/index_range.hpp b/core/base/index_range.hpp index 2330cc5b43c..ca972363b4a 100644 --- a/core/base/index_range.hpp +++ b/core/base/index_range.hpp @@ -10,7 +10,6 @@ #include #include - #include "core/base/iterator_range.hpp" diff --git a/core/base/index_set.cpp b/core/base/index_set.cpp index b27d3803448..715916aa37a 100644 --- a/core/base/index_set.cpp +++ b/core/base/index_set.cpp @@ -4,17 +4,14 @@ #include "ginkgo/core/base/index_set.hpp" - #include #include 
#include #include - #include #include - #include "core/base/array_access.hpp" #include "core/base/index_set_kernels.hpp" diff --git a/core/base/index_set_kernels.hpp b/core/base/index_set_kernels.hpp index 63170d0e853..7e742ea062f 100644 --- a/core/base/index_set_kernels.hpp +++ b/core/base/index_set_kernels.hpp @@ -6,11 +6,9 @@ #define GKO_CORE_BASE_INDEX_SET_KERNELS_HPP_ -#include - - #include +#include #include "core/base/kernel_declaration.hpp" diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp index 366f1b3bf60..3d224836b1a 100644 --- a/core/base/iterator_factory.hpp +++ b/core/base/iterator_factory.hpp @@ -13,7 +13,6 @@ #include #include - #include "core/base/copy_assignable.hpp" diff --git a/core/base/memory.cpp b/core/base/memory.cpp index 0b3e0ce833b..a9a07a74e6a 100644 --- a/core/base/memory.cpp +++ b/core/base/memory.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/base/memory.hpp" - #include - #include diff --git a/core/base/mpi.cpp b/core/base/mpi.cpp index 0a703675158..652ef8662e5 100644 --- a/core/base/mpi.cpp +++ b/core/base/mpi.cpp @@ -10,7 +10,6 @@ #include - #include diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index e2f2dbf5d9b..c264a073f31 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/base/mtx_io.hpp" - #include #include #include @@ -14,7 +13,6 @@ #include #include - #include #include #include diff --git a/core/base/perturbation.cpp b/core/base/perturbation.cpp index 94a4975cfa0..686c54e5b2d 100644 --- a/core/base/perturbation.cpp +++ b/core/base/perturbation.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/base/perturbation.hpp" - #include #include diff --git a/core/base/segmented_array.cpp b/core/base/segmented_array.cpp index cc31abb7686..4a88d42128f 100644 --- a/core/base/segmented_array.cpp +++ b/core/base/segmented_array.cpp @@ -4,7 +4,6 @@ #include - #include "core/base/array_access.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git 
a/core/base/timer.cpp b/core/base/timer.cpp index abd5fbf61cd..9050f00fa17 100644 --- a/core/base/timer.cpp +++ b/core/base/timer.cpp @@ -4,16 +4,13 @@ #include "ginkgo/core/base/timer.hpp" - #include #include #include - #include #include - #include "cuda/base/device.hpp" #include "dpcpp/base/device.hpp" #include "hip/base/device.hpp" diff --git a/core/base/utils.hpp b/core/base/utils.hpp index fee0a300c16..061c6e303ed 100644 --- a/core/base/utils.hpp +++ b/core/base/utils.hpp @@ -6,15 +6,12 @@ #define GKO_CORE_BASE_UTILS_HPP_ -#include - - #include #include - #include #include +#include #include diff --git a/core/base/workspace_aliases.hpp b/core/base/workspace_aliases.hpp index af1391300f4..ddea34a71e9 100644 --- a/core/base/workspace_aliases.hpp +++ b/core/base/workspace_aliases.hpp @@ -11,7 +11,6 @@ #include #include - #include #include diff --git a/core/components/absolute_array_kernels.hpp b/core/components/absolute_array_kernels.hpp index 18f32ddc3e6..7617883cd1c 100644 --- a/core/components/absolute_array_kernels.hpp +++ b/core/components/absolute_array_kernels.hpp @@ -8,12 +8,10 @@ #include - #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/components/addressable_pq.hpp b/core/components/addressable_pq.hpp index d4e1f20fc80..e5b5a3e0fbe 100644 --- a/core/components/addressable_pq.hpp +++ b/core/components/addressable_pq.hpp @@ -9,11 +9,9 @@ #include #include - #include #include - #include "core/base/allocator.hpp" diff --git a/core/components/fill_array_kernels.hpp b/core/components/fill_array_kernels.hpp index 4a6d8c6a3d1..2608cabe409 100644 --- a/core/components/fill_array_kernels.hpp +++ b/core/components/fill_array_kernels.hpp @@ -8,11 +8,9 @@ #include - #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/components/format_conversion_kernels.hpp b/core/components/format_conversion_kernels.hpp index 2a72da998f9..10be3a10232 100644 --- a/core/components/format_conversion_kernels.hpp 
+++ b/core/components/format_conversion_kernels.hpp @@ -8,12 +8,10 @@ #include - #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/components/precision_conversion_kernels.hpp b/core/components/precision_conversion_kernels.hpp index 0839530a92c..8443a657502 100644 --- a/core/components/precision_conversion_kernels.hpp +++ b/core/components/precision_conversion_kernels.hpp @@ -8,12 +8,10 @@ #include - #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/components/prefix_sum_kernels.hpp b/core/components/prefix_sum_kernels.hpp index e43e2cf042c..8b68b54e29f 100644 --- a/core/components/prefix_sum_kernels.hpp +++ b/core/components/prefix_sum_kernels.hpp @@ -8,11 +8,9 @@ #include - #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/components/reduce_array_kernels.hpp b/core/components/reduce_array_kernels.hpp index 2d02906ebdc..b124e6ec2e3 100644 --- a/core/components/reduce_array_kernels.hpp +++ b/core/components/reduce_array_kernels.hpp @@ -8,12 +8,10 @@ #include - #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/config/config.cpp b/core/config/config.cpp index 87dd49b6c03..adb47e5ef75 100644 --- a/core/config/config.cpp +++ b/core/config/config.cpp @@ -4,14 +4,11 @@ #include "ginkgo/core/config/config.hpp" - #include - #include #include - #include "core/config/config_helper.hpp" #include "core/config/registry_accessor.hpp" diff --git a/core/config/config_helper.cpp b/core/config/config_helper.cpp index c12143ff8d6..30b33063413 100644 --- a/core/config/config_helper.cpp +++ b/core/config/config_helper.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "core/config/config_helper.hpp" +#include #include #include #include - -#include "core/config/config_helper.hpp" #include "core/config/registry_accessor.hpp" #include "core/config/stop_config.hpp" diff --git 
a/core/config/config_helper.hpp b/core/config/config_helper.hpp index 0866cf9695a..f84e6799bf7 100644 --- a/core/config/config_helper.hpp +++ b/core/config/config_helper.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include @@ -18,7 +17,6 @@ #include #include - #include "core/config/registry_accessor.hpp" diff --git a/core/config/dispatch.hpp b/core/config/dispatch.hpp index 5bf5dc3273e..0138665aac2 100644 --- a/core/config/dispatch.hpp +++ b/core/config/dispatch.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include @@ -17,7 +16,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/type_descriptor_helper.hpp" diff --git a/core/config/factorization_config.cpp b/core/config/factorization_config.cpp index df6439d1297..259d32cb872 100644 --- a/core/config/factorization_config.cpp +++ b/core/config/factorization_config.cpp @@ -14,7 +14,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/dispatch.hpp" #include "core/config/parse_macro.hpp" diff --git a/core/config/multigrid_config.cpp b/core/config/multigrid_config.cpp index 553e6ca033d..83be1a1742b 100644 --- a/core/config/multigrid_config.cpp +++ b/core/config/multigrid_config.cpp @@ -2,10 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "ginkgo/core/multigrid/pgm.hpp" - - #include "core/config/parse_macro.hpp" +#include "ginkgo/core/multigrid/pgm.hpp" namespace gko { diff --git a/core/config/parse_macro.hpp b/core/config/parse_macro.hpp index cbc9438fbb7..800b42f9493 100644 --- a/core/config/parse_macro.hpp +++ b/core/config/parse_macro.hpp @@ -10,7 +10,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/dispatch.hpp" #include "core/config/type_descriptor_helper.hpp" diff --git a/core/config/preconditioner_config.cpp b/core/config/preconditioner_config.cpp index e5f8ee94ea6..cba54cb3356 100644 --- a/core/config/preconditioner_config.cpp +++ 
b/core/config/preconditioner_config.cpp @@ -13,7 +13,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/dispatch.hpp" #include "core/config/parse_macro.hpp" diff --git a/core/config/property_tree.cpp b/core/config/property_tree.cpp index 1ab33712953..3f6826bf634 100644 --- a/core/config/property_tree.cpp +++ b/core/config/property_tree.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/config/property_tree.hpp" - #include diff --git a/core/config/registry.cpp b/core/config/registry.cpp index 8b8bdbcaf0d..1718de5fed2 100644 --- a/core/config/registry.cpp +++ b/core/config/registry.cpp @@ -4,11 +4,9 @@ #include "ginkgo/core/config/registry.hpp" - #include #include - #include "core/config/config_helper.hpp" diff --git a/core/config/registry_accessor.hpp b/core/config/registry_accessor.hpp index 002e6245811..5b007632f0c 100644 --- a/core/config/registry_accessor.hpp +++ b/core/config/registry_accessor.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/core/config/solver_config.cpp b/core/config/solver_config.cpp index 27c06f7f895..b35a639b8e7 100644 --- a/core/config/solver_config.cpp +++ b/core/config/solver_config.cpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/config/solver_config.hpp" + #include #include #include @@ -19,11 +21,9 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/dispatch.hpp" #include "core/config/parse_macro.hpp" -#include "core/config/solver_config.hpp" namespace gko { diff --git a/core/config/solver_config.hpp b/core/config/solver_config.hpp index 3c820541f2c..e5f51ff85f4 100644 --- a/core/config/solver_config.hpp +++ b/core/config/solver_config.hpp @@ -9,7 +9,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/dispatch.hpp" diff --git a/core/config/stop_config.cpp b/core/config/stop_config.cpp index 2270cf5f84c..4623eb768fc 100644 --- a/core/config/stop_config.cpp +++ b/core/config/stop_config.cpp @@ -2,6 +2,8 @@ 
// // SPDX-License-Identifier: BSD-3-Clause +#include "core/config/stop_config.hpp" + #include #include #include @@ -12,11 +14,9 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/dispatch.hpp" #include "core/config/registry_accessor.hpp" -#include "core/config/stop_config.hpp" #include "core/config/type_descriptor_helper.hpp" diff --git a/core/config/trisolver_config.hpp b/core/config/trisolver_config.hpp index 301109cab6b..8d7f6fb680d 100644 --- a/core/config/trisolver_config.hpp +++ b/core/config/trisolver_config.hpp @@ -10,7 +10,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/dispatch.hpp" diff --git a/core/config/type_descriptor.cpp b/core/config/type_descriptor.cpp index cbc29c5088a..93ec1d3f929 100644 --- a/core/config/type_descriptor.cpp +++ b/core/config/type_descriptor.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/config/type_descriptor.hpp" - #include - #include "core/config/type_descriptor_helper.hpp" diff --git a/core/config/type_descriptor_helper.hpp b/core/config/type_descriptor_helper.hpp index 3917e317773..0edc4376f1a 100644 --- a/core/config/type_descriptor_helper.hpp +++ b/core/config/type_descriptor_helper.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index f26b221a799..f5dc92ce16e 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -5,7 +5,6 @@ #include #include - #include "core/base/batch_multi_vector_kernels.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/base/index_set_kernels.hpp" diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp index 15bb5d05735..abda9e4e0f6 100644 --- a/core/device_hooks/cuda_hooks.cpp +++ b/core/device_hooks/cuda_hooks.cpp @@ -4,7 +4,6 @@ #include - #include #include #include diff --git a/core/device_hooks/dpcpp_hooks.cpp 
b/core/device_hooks/dpcpp_hooks.cpp index 4be09fc60a2..6cd86581998 100644 --- a/core/device_hooks/dpcpp_hooks.cpp +++ b/core/device_hooks/dpcpp_hooks.cpp @@ -5,7 +5,6 @@ #include #include - #include #include #include diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index a90691e1af4..573fb37b8f0 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -5,7 +5,6 @@ #include #include - #include #include #include diff --git a/core/distributed/helpers.hpp b/core/distributed/helpers.hpp index 9edf8282ed9..5536dbe32f0 100644 --- a/core/distributed/helpers.hpp +++ b/core/distributed/helpers.hpp @@ -8,13 +8,11 @@ #include - #include #include #include #include - #include "core/base/dispatch_helper.hpp" diff --git a/core/distributed/index_map.cpp b/core/distributed/index_map.cpp index e24d8141b4d..9f0ed8137ba 100644 --- a/core/distributed/index_map.cpp +++ b/core/distributed/index_map.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/distributed/index_map.hpp" - #include "core/distributed/index_map_kernels.hpp" diff --git a/core/distributed/index_map_kernels.hpp b/core/distributed/index_map_kernels.hpp index c808a4b8d19..4694ba6cc10 100644 --- a/core/distributed/index_map_kernels.hpp +++ b/core/distributed/index_map_kernels.hpp @@ -6,14 +6,11 @@ #define GKO_CORE_DISTRIBUTED_INDEX_MAP_KERNELS_HPP_ -#include - - #include #include +#include #include - #include "core/base/kernel_declaration.hpp" #include "core/base/segmented_array.hpp" diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 2d2d1304769..8eee020a3e6 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -4,13 +4,11 @@ #include "ginkgo/core/distributed/matrix.hpp" - #include #include #include #include - #include "core/distributed/matrix_kernels.hpp" diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp index a424c49c442..f24e8c9945e 100644 --- a/core/distributed/matrix_kernels.hpp +++ 
b/core/distributed/matrix_kernels.hpp @@ -12,7 +12,6 @@ #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/distributed/partition.cpp b/core/distributed/partition.cpp index 5e6903de872..763986f3a86 100644 --- a/core/distributed/partition.cpp +++ b/core/distributed/partition.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/distributed/partition.hpp" - #include "core/base/array_access.hpp" #include "core/distributed/partition_kernels.hpp" diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp index 1a55daf8134..75c3d14f971 100644 --- a/core/distributed/partition_helpers.cpp +++ b/core/distributed/partition_helpers.cpp @@ -4,13 +4,10 @@ #include "ginkgo/core/distributed/partition_helpers.hpp" - #include - #include - #include "core/components/fill_array_kernels.hpp" #include "core/distributed/partition_helpers_kernels.hpp" diff --git a/core/distributed/partition_helpers_kernels.hpp b/core/distributed/partition_helpers_kernels.hpp index 9c87629b60c..0262c3007ee 100644 --- a/core/distributed/partition_helpers_kernels.hpp +++ b/core/distributed/partition_helpers_kernels.hpp @@ -8,7 +8,6 @@ #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/distributed/partition_kernels.hpp b/core/distributed/partition_kernels.hpp index fd7e214dbbd..b1df933e5c8 100644 --- a/core/distributed/partition_kernels.hpp +++ b/core/distributed/partition_kernels.hpp @@ -8,7 +8,6 @@ #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp index 2def0a0f85c..7235038847d 100644 --- a/core/distributed/preconditioner/schwarz.cpp +++ b/core/distributed/preconditioner/schwarz.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/distributed/preconditioner/schwarz.hpp" - #include - #include #include #include @@ -18,7 +16,6 @@ #include #include - #include "core/base/utils.hpp" #include "core/distributed/helpers.hpp" diff --git 
a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 2e57fcf7451..ae7ab182a85 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/distributed/vector.hpp" - #include - #include "core/distributed/vector_kernels.hpp" #include "core/matrix/dense_kernels.hpp" diff --git a/core/distributed/vector_kernels.hpp b/core/distributed/vector_kernels.hpp index 1d5fcb6a51e..c288b8918a1 100644 --- a/core/distributed/vector_kernels.hpp +++ b/core/distributed/vector_kernels.hpp @@ -13,7 +13,6 @@ #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/factorization/cholesky.cpp b/core/factorization/cholesky.cpp index 12456df4abc..81627ad229b 100644 --- a/core/factorization/cholesky.cpp +++ b/core/factorization/cholesky.cpp @@ -4,14 +4,12 @@ #include "ginkgo/core/factorization/cholesky.hpp" - #include #include #include #include #include - #include "core/base/array_access.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/config/config_helper.hpp" diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index ff758d988db..db889ce1162 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -8,12 +8,10 @@ #include - #include #include #include - #include "core/base/kernel_declaration.hpp" #include "core/factorization/elimination_forest.hpp" diff --git a/core/factorization/elimination_forest.cpp b/core/factorization/elimination_forest.cpp index 138db0f6350..1dc8ff060a0 100644 --- a/core/factorization/elimination_forest.cpp +++ b/core/factorization/elimination_forest.cpp @@ -4,7 +4,6 @@ #include "core/factorization/elimination_forest.hpp" - #include diff --git a/core/factorization/elimination_forest.hpp b/core/factorization/elimination_forest.hpp index 7ab7f9c715b..5307a90384c 100644 --- a/core/factorization/elimination_forest.hpp +++ b/core/factorization/elimination_forest.hpp @@ -10,7 
+10,6 @@ #include #include - #include "core/components/disjoint_sets.hpp" diff --git a/core/factorization/factorization.cpp b/core/factorization/factorization.cpp index 597fc7b48f4..1df1f49aa13 100644 --- a/core/factorization/factorization.cpp +++ b/core/factorization/factorization.cpp @@ -4,12 +4,10 @@ #include "ginkgo/core/factorization/factorization.hpp" - #include #include #include - #include "core/base/array_access.hpp" #include "core/factorization/factorization_kernels.hpp" diff --git a/core/factorization/factorization_kernels.hpp b/core/factorization/factorization_kernels.hpp index 0dc0f9fc16e..bab3dd16bd2 100644 --- a/core/factorization/factorization_kernels.hpp +++ b/core/factorization/factorization_kernels.hpp @@ -8,12 +8,10 @@ #include - #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp index 67fb3df5b46..2257e6256e4 100644 --- a/core/factorization/ic.cpp +++ b/core/factorization/ic.cpp @@ -4,17 +4,14 @@ #include "ginkgo/core/factorization/ic.hpp" - #include - #include #include #include #include #include - #include "core/base/array_access.hpp" #include "core/config/config_helper.hpp" #include "core/factorization/factorization_kernels.hpp" diff --git a/core/factorization/ic_kernels.hpp b/core/factorization/ic_kernels.hpp index 29d400ad183..187e6cf0e6d 100644 --- a/core/factorization/ic_kernels.hpp +++ b/core/factorization/ic_kernels.hpp @@ -6,16 +6,12 @@ #define GKO_CORE_FACTORIZATION_IC_KERNELS_HPP_ -#include - - #include - #include +#include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp index 15f3cef1831..41df4065979 100644 --- a/core/factorization/ilu.cpp +++ b/core/factorization/ilu.cpp @@ -4,16 +4,13 @@ #include "ginkgo/core/factorization/ilu.hpp" - #include - #include #include #include #include - #include "core/base/array_access.hpp" #include "core/config/config_helper.hpp" #include 
"core/factorization/factorization_kernels.hpp" diff --git a/core/factorization/ilu_kernels.hpp b/core/factorization/ilu_kernels.hpp index 562d6e0901d..2371c17fda4 100644 --- a/core/factorization/ilu_kernels.hpp +++ b/core/factorization/ilu_kernels.hpp @@ -6,17 +6,13 @@ #define GKO_CORE_FACTORIZATION_ILU_KERNELS_HPP_ -#include - - #include - #include #include +#include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp index 8ab1ddfc37f..fb9cab4154a 100644 --- a/core/factorization/lu.cpp +++ b/core/factorization/lu.cpp @@ -4,14 +4,12 @@ #include "ginkgo/core/factorization/lu.hpp" - #include #include #include #include #include - #include "core/base/array_access.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/config/config_helper.hpp" diff --git a/core/factorization/lu_kernels.hpp b/core/factorization/lu_kernels.hpp index 601f424087f..f497398cb90 100644 --- a/core/factorization/lu_kernels.hpp +++ b/core/factorization/lu_kernels.hpp @@ -8,12 +8,10 @@ #include - #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/factorization/par_ic.cpp b/core/factorization/par_ic.cpp index c21f66934aa..f4a4afd23d6 100644 --- a/core/factorization/par_ic.cpp +++ b/core/factorization/par_ic.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/factorization/par_ic.hpp" - #include - #include #include #include @@ -18,7 +16,6 @@ #include #include - #include "core/base/array_access.hpp" #include "core/components/format_conversion_kernels.hpp" #include "core/config/config_helper.hpp" diff --git a/core/factorization/par_ic_kernels.hpp b/core/factorization/par_ic_kernels.hpp index 47cfc8c37a2..59d2d97ffce 100644 --- a/core/factorization/par_ic_kernels.hpp +++ b/core/factorization/par_ic_kernels.hpp @@ -6,18 +6,14 @@ #define GKO_CORE_FACTORIZATION_PAR_IC_KERNELS_HPP_ -#include - - #include - #include #include +#include #include #include - #include 
"core/base/kernel_declaration.hpp" diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp index 54176d79545..a0e8a628ca8 100644 --- a/core/factorization/par_ict.cpp +++ b/core/factorization/par_ict.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/factorization/par_ict.hpp" - #include - #include #include #include @@ -17,7 +15,6 @@ #include #include - #include "core/base/array_access.hpp" #include "core/base/utils.hpp" #include "core/components/format_conversion_kernels.hpp" diff --git a/core/factorization/par_ict_kernels.hpp b/core/factorization/par_ict_kernels.hpp index 0b0c2c6bcd2..25172c0d649 100644 --- a/core/factorization/par_ict_kernels.hpp +++ b/core/factorization/par_ict_kernels.hpp @@ -6,18 +6,14 @@ #define GKO_CORE_FACTORIZATION_PAR_ICT_KERNELS_HPP_ -#include - - #include - #include #include +#include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/factorization/par_ilu.cpp b/core/factorization/par_ilu.cpp index f69947adcac..68c0c0c4fc6 100644 --- a/core/factorization/par_ilu.cpp +++ b/core/factorization/par_ilu.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/factorization/par_ilu.hpp" - #include - #include #include #include @@ -17,7 +15,6 @@ #include #include - #include "core/base/array_access.hpp" #include "core/config/config_helper.hpp" #include "core/factorization/factorization_kernels.hpp" diff --git a/core/factorization/par_ilu_kernels.hpp b/core/factorization/par_ilu_kernels.hpp index 943cd1cf9bc..16d20859c3e 100644 --- a/core/factorization/par_ilu_kernels.hpp +++ b/core/factorization/par_ilu_kernels.hpp @@ -6,17 +6,13 @@ #define GKO_CORE_FACTORIZATION_PAR_ILU_KERNELS_HPP_ -#include - - #include - #include #include +#include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp index ff4b5b2a83e..42e3cc03130 100644 --- a/core/factorization/par_ilut.cpp +++ b/core/factorization/par_ilut.cpp @@ -4,10 +4,8 @@ #include 
"ginkgo/core/factorization/par_ilut.hpp" - #include - #include #include #include @@ -17,7 +15,6 @@ #include #include - #include "core/base/array_access.hpp" #include "core/base/utils.hpp" #include "core/components/format_conversion_kernels.hpp" diff --git a/core/factorization/par_ilut_kernels.hpp b/core/factorization/par_ilut_kernels.hpp index 880acc7a4c2..2d8ac7b4f88 100644 --- a/core/factorization/par_ilut_kernels.hpp +++ b/core/factorization/par_ilut_kernels.hpp @@ -6,18 +6,14 @@ #define GKO_CORE_FACTORIZATION_PAR_ILUT_KERNELS_HPP_ -#include - - #include - #include #include +#include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp index c55bfb3e759..23f6b94cc14 100644 --- a/core/factorization/symbolic.cpp +++ b/core/factorization/symbolic.cpp @@ -4,14 +4,12 @@ #include "core/factorization/symbolic.hpp" - #include #include #include #include #include - #include "core/base/allocator.hpp" #include "core/base/array_access.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/core/factorization/symbolic.hpp b/core/factorization/symbolic.hpp index c98c623c92d..096d8c998bc 100644 --- a/core/factorization/symbolic.hpp +++ b/core/factorization/symbolic.hpp @@ -4,7 +4,6 @@ #include - #include "core/factorization/elimination_forest.hpp" diff --git a/core/log/batch_logger.cpp b/core/log/batch_logger.cpp index 532cae64c28..286803c0ae1 100644 --- a/core/log/batch_logger.cpp +++ b/core/log/batch_logger.cpp @@ -4,12 +4,10 @@ #include "ginkgo/core/log/batch_logger.hpp" - #include #include #include - #include "core/base/workspace_aliases.hpp" diff --git a/core/log/convergence.cpp b/core/log/convergence.cpp index 16c89e08ffc..7cfa764dfd1 100644 --- a/core/log/convergence.cpp +++ b/core/log/convergence.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/log/convergence.hpp" - #include #include #include @@ -12,7 +11,6 @@ #include #include - #include "core/base/dispatch_helper.hpp" 
#include "core/distributed/helpers.hpp" diff --git a/core/log/papi.cpp b/core/log/papi.cpp index 83a9bd3b93c..5ced377ca38 100644 --- a/core/log/papi.cpp +++ b/core/log/papi.cpp @@ -4,11 +4,9 @@ #include "ginkgo/core/log/papi.hpp" - #include #include - #include "core/distributed/helpers.hpp" diff --git a/core/log/performance_hint.cpp b/core/log/performance_hint.cpp index 3b0a720aa93..5f497c33fe0 100644 --- a/core/log/performance_hint.cpp +++ b/core/log/performance_hint.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/log/performance_hint.hpp" - #include #include diff --git a/core/log/profiler_hook.cpp b/core/log/profiler_hook.cpp index 87ea8f42d02..7cb4f807919 100644 --- a/core/log/profiler_hook.cpp +++ b/core/log/profiler_hook.cpp @@ -4,18 +4,15 @@ #include "ginkgo/core/log/profiler_hook.hpp" - #include #include #include - #include #include #include #include - #include "core/log/profiler_hook.hpp" diff --git a/core/log/profiler_hook_summary.cpp b/core/log/profiler_hook_summary.cpp index c7d2f3ea95b..02641bdfd29 100644 --- a/core/log/profiler_hook_summary.cpp +++ b/core/log/profiler_hook_summary.cpp @@ -8,7 +8,6 @@ #include #include - #include "core/log/profiler_hook.hpp" diff --git a/core/log/profiler_hook_summary_writer.cpp b/core/log/profiler_hook_summary_writer.cpp index 4139f5938c9..dd39687ffe4 100644 --- a/core/log/profiler_hook_summary_writer.cpp +++ b/core/log/profiler_hook_summary_writer.cpp @@ -5,7 +5,6 @@ #include #include - #include diff --git a/core/log/record.cpp b/core/log/record.cpp index 6d995cd348c..0d810c05fa0 100644 --- a/core/log/record.cpp +++ b/core/log/record.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/log/record.hpp" - #include #include #include diff --git a/core/log/stream.cpp b/core/log/stream.cpp index 033575c9b54..5e510d409e2 100644 --- a/core/log/stream.cpp +++ b/core/log/stream.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/log/stream.hpp" - #include - #include #include #include diff --git a/core/matrix/batch_csr.cpp b/core/matrix/batch_csr.cpp 
index 8e4b1434f8e..1b1dc22a6c4 100644 --- a/core/matrix/batch_csr.cpp +++ b/core/matrix/batch_csr.cpp @@ -4,11 +4,9 @@ #include "ginkgo/core/matrix/batch_csr.hpp" - #include #include - #include #include #include @@ -17,7 +15,6 @@ #include #include - #include "core/matrix/batch_csr_kernels.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/core/matrix/batch_csr_kernels.hpp b/core/matrix/batch_csr_kernels.hpp index d2066389bba..2ee4e2100a2 100644 --- a/core/matrix/batch_csr_kernels.hpp +++ b/core/matrix/batch_csr_kernels.hpp @@ -6,14 +6,11 @@ #define GKO_CORE_MATRIX_BATCH_CSR_KERNELS_HPP_ -#include - - #include #include #include #include - +#include #include "core/base/kernel_declaration.hpp" diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index a2eb017cf7c..6390a4c7ad0 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -4,11 +4,9 @@ #include "ginkgo/core/matrix/batch_dense.hpp" - #include #include - #include #include #include @@ -18,7 +16,6 @@ #include #include - #include "core/matrix/batch_dense_kernels.hpp" diff --git a/core/matrix/batch_dense_kernels.hpp b/core/matrix/batch_dense_kernels.hpp index 6c5f4a02242..13b88f9f4b2 100644 --- a/core/matrix/batch_dense_kernels.hpp +++ b/core/matrix/batch_dense_kernels.hpp @@ -6,12 +6,9 @@ #define GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_ -#include - - #include #include - +#include #include "core/base/kernel_declaration.hpp" diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp index 5c3da632643..3722c41de60 100644 --- a/core/matrix/batch_ell.cpp +++ b/core/matrix/batch_ell.cpp @@ -4,11 +4,9 @@ #include "ginkgo/core/matrix/batch_ell.hpp" - #include #include - #include #include #include @@ -17,7 +15,6 @@ #include #include - #include "core/matrix/batch_ell_kernels.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/core/matrix/batch_ell_kernels.hpp b/core/matrix/batch_ell_kernels.hpp index c5d7f8cb857..77707c89a8b 100644 --- 
a/core/matrix/batch_ell_kernels.hpp +++ b/core/matrix/batch_ell_kernels.hpp @@ -6,13 +6,10 @@ #define GKO_CORE_MATRIX_BATCH_ELL_KERNELS_HPP_ -#include - - #include #include #include - +#include #include "core/base/kernel_declaration.hpp" diff --git a/core/matrix/batch_identity.cpp b/core/matrix/batch_identity.cpp index 480f0a10474..2220120d00b 100644 --- a/core/matrix/batch_identity.cpp +++ b/core/matrix/batch_identity.cpp @@ -4,11 +4,9 @@ #include "ginkgo/core/matrix/batch_identity.hpp" - #include #include - #include #include #include diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index eb8b33c0cf1..1368dc261c3 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -4,11 +4,9 @@ #include "ginkgo/core/matrix/coo.hpp" - #include #include - #include #include #include @@ -18,7 +16,6 @@ #include #include - #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/absolute_array_kernels.hpp" #include "core/components/fill_array_kernels.hpp" diff --git a/core/matrix/coo_kernels.hpp b/core/matrix/coo_kernels.hpp index eece195ab74..a2cc44b74d9 100644 --- a/core/matrix/coo_kernels.hpp +++ b/core/matrix/coo_kernels.hpp @@ -6,15 +6,12 @@ #define GKO_CORE_MATRIX_COO_KERNELS_HPP_ -#include - - #include +#include #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 8dad86568fb..e50732a3be9 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/matrix/csr.hpp" - #include #include #include @@ -22,7 +21,6 @@ #include #include - #include "core/base/array_access.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/absolute_array_kernels.hpp" diff --git a/core/matrix/csr_accessor_helper.hpp b/core/matrix/csr_accessor_helper.hpp index bfbca1d5cb5..2187439920e 100644 --- a/core/matrix/csr_accessor_helper.hpp +++ b/core/matrix/csr_accessor_helper.hpp @@ -9,7 +9,6 @@ #include #include - #include 
"accessor/index_span.hpp" #include "accessor/reduced_row_major.hpp" #include "accessor/utils.hpp" diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 0cebe435e4f..6013e014c8a 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -6,13 +6,11 @@ #define GKO_CORE_MATRIX_CSR_KERNELS_HPP_ -#include - - #include #include #include #include +#include #include #include #include @@ -20,7 +18,6 @@ #include #include - #include "core/base/kernel_declaration.hpp" #include "core/matrix/csr_lookup.hpp" diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp index 6de3265ff21..a7b687c3618 100644 --- a/core/matrix/csr_lookup.hpp +++ b/core/matrix/csr_lookup.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index eb52c574db9..171ff007b4a 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -4,11 +4,9 @@ #include "ginkgo/core/matrix/dense.hpp" - #include #include - #include #include #include @@ -28,7 +26,6 @@ #include #include - #include "core/base/array_access.hpp" #include "core/base/dispatch_helper.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index 63999b12f82..7422b431aa0 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -6,17 +6,13 @@ #define GKO_CORE_MATRIX_DENSE_KERNELS_HPP_ -#include - - #include - #include #include +#include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index 08b1e00e340..1a442ffc789 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -4,13 +4,11 @@ #include "ginkgo/core/matrix/diagonal.hpp" - #include #include #include #include - #include "core/components/absolute_array_kernels.hpp" #include "core/matrix/diagonal_kernels.hpp" diff --git a/core/matrix/diagonal_kernels.hpp b/core/matrix/diagonal_kernels.hpp index 
930144491f4..630c76e43ad 100644 --- a/core/matrix/diagonal_kernels.hpp +++ b/core/matrix/diagonal_kernels.hpp @@ -6,13 +6,10 @@ #define GKO_CORE_MATRIX_DIAGONAL_KERNELS_HPP_ -#include - - #include #include #include - +#include #include "core/base/kernel_declaration.hpp" diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index f6433fe156a..87b74c7f417 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/matrix/ell.hpp" - #include - #include #include #include @@ -17,7 +15,6 @@ #include #include - #include "core/base/allocator.hpp" #include "core/base/array_access.hpp" #include "core/base/device_matrix_data_kernels.hpp" diff --git a/core/matrix/ell_kernels.hpp b/core/matrix/ell_kernels.hpp index f31164702d9..7b88507d650 100644 --- a/core/matrix/ell_kernels.hpp +++ b/core/matrix/ell_kernels.hpp @@ -6,13 +6,10 @@ #define GKO_CORE_MATRIX_ELL_KERNELS_HPP_ -#include - - #include #include #include - +#include #include "core/base/kernel_declaration.hpp" diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index 1ea00d741bd..a48e32be088 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -4,11 +4,9 @@ #include "ginkgo/core/matrix/fbcsr.hpp" - #include #include - #include #include #include @@ -22,7 +20,6 @@ #include #include - #include "accessor/block_col_major.hpp" #include "accessor/range.hpp" #include "core/components/absolute_array_kernels.hpp" diff --git a/core/matrix/fbcsr_kernels.hpp b/core/matrix/fbcsr_kernels.hpp index c5f1ee6fb03..7a644d48d78 100644 --- a/core/matrix/fbcsr_kernels.hpp +++ b/core/matrix/fbcsr_kernels.hpp @@ -6,17 +6,14 @@ #define GKO_CORE_MATRIX_FBCSR_KERNELS_HPP_ -#include - - #include #include #include #include #include +#include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/matrix/fft.cpp b/core/matrix/fft.cpp index 1ec69ce3338..cd6f20c1edc 100644 --- a/core/matrix/fft.cpp +++ b/core/matrix/fft.cpp @@ -4,12 +4,10 @@ #include "ginkgo/core/matrix/fft.hpp" 
- #include #include #include - #include "core/matrix/fft_kernels.hpp" diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index c30c60ce0fb..d450a0dfc35 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/matrix/hybrid.hpp" - #include - #include #include #include @@ -16,7 +14,6 @@ #include #include - #include "core/base/array_access.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/absolute_array_kernels.hpp" diff --git a/core/matrix/hybrid_kernels.hpp b/core/matrix/hybrid_kernels.hpp index 9460a521c90..85ff74bfab5 100644 --- a/core/matrix/hybrid_kernels.hpp +++ b/core/matrix/hybrid_kernels.hpp @@ -6,11 +6,8 @@ #define GKO_CORE_MATRIX_HYBRID_KERNELS_HPP_ -#include - - #include - +#include #include "core/base/kernel_declaration.hpp" diff --git a/core/matrix/identity.cpp b/core/matrix/identity.cpp index a58601f31f0..7e035be82a3 100644 --- a/core/matrix/identity.cpp +++ b/core/matrix/identity.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/matrix/identity.hpp" - #include #include #include diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp index 76f5d7c8005..0fe7ba2b2ce 100644 --- a/core/matrix/permutation.cpp +++ b/core/matrix/permutation.cpp @@ -4,14 +4,12 @@ #include "ginkgo/core/matrix/permutation.hpp" - #include #include #include #include #include - #include "core/base/dispatch_helper.hpp" #include "core/matrix/permutation_kernels.hpp" diff --git a/core/matrix/permutation.hpp b/core/matrix/permutation.hpp index 6e674f3db79..6ae375d63cf 100644 --- a/core/matrix/permutation.hpp +++ b/core/matrix/permutation.hpp @@ -6,10 +6,8 @@ #define GKO_CORE_MATRIX_PERMUTATION_HPP_ -#include - - #include +#include namespace gko { diff --git a/core/matrix/permutation_kernels.hpp b/core/matrix/permutation_kernels.hpp index 65b207f5fdf..9a1d269d610 100644 --- a/core/matrix/permutation_kernels.hpp +++ b/core/matrix/permutation_kernels.hpp @@ -18,7 +18,6 @@ #include #include - 
#include "core/base/kernel_declaration.hpp" #include "core/matrix/csr_lookup.hpp" diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp index 72a6cbe2808..fecc60a0ca9 100644 --- a/core/matrix/row_gatherer.cpp +++ b/core/matrix/row_gatherer.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/matrix/row_gatherer.hpp" - #include - #include "core/base/dispatch_helper.hpp" diff --git a/core/matrix/scaled_permutation.cpp b/core/matrix/scaled_permutation.cpp index c948c6071ad..0f295d6b5be 100644 --- a/core/matrix/scaled_permutation.cpp +++ b/core/matrix/scaled_permutation.cpp @@ -4,12 +4,10 @@ #include "ginkgo/core/matrix/scaled_permutation.hpp" - #include #include #include - #include "core/matrix/scaled_permutation_kernels.hpp" diff --git a/core/matrix/scaled_permutation_kernels.hpp b/core/matrix/scaled_permutation_kernels.hpp index 1cc664002a3..696c53a387f 100644 --- a/core/matrix/scaled_permutation_kernels.hpp +++ b/core/matrix/scaled_permutation_kernels.hpp @@ -8,7 +8,6 @@ #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index 39e2c706b19..a4787e758bf 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/matrix/sellp.hpp" - #include #include #include @@ -13,7 +12,6 @@ #include #include - #include "core/base/allocator.hpp" #include "core/base/array_access.hpp" #include "core/base/device_matrix_data_kernels.hpp" diff --git a/core/matrix/sellp_kernels.hpp b/core/matrix/sellp_kernels.hpp index ce5cea36a84..fb4706039fb 100644 --- a/core/matrix/sellp_kernels.hpp +++ b/core/matrix/sellp_kernels.hpp @@ -6,13 +6,10 @@ #define GKO_CORE_MATRIX_SELLP_KERNELS_HPP_ -#include - - #include #include #include - +#include #include "core/base/kernel_declaration.hpp" diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp index 2ec463613b0..9b8ea04da52 100644 --- a/core/matrix/sparsity_csr.cpp +++ b/core/matrix/sparsity_csr.cpp @@ -4,7 +4,6 @@ #include 
"ginkgo/core/matrix/sparsity_csr.hpp" - #include #include #include @@ -13,7 +12,6 @@ #include #include - #include "core/base/array_access.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" diff --git a/core/matrix/sparsity_csr_kernels.hpp b/core/matrix/sparsity_csr_kernels.hpp index 869f43e1a7c..e07bb980dce 100644 --- a/core/matrix/sparsity_csr_kernels.hpp +++ b/core/matrix/sparsity_csr_kernels.hpp @@ -6,12 +6,9 @@ #define GKO_CORE_MATRIX_SPARSITY_CSR_KERNELS_HPP_ -#include - - #include #include - +#include #include "core/base/kernel_declaration.hpp" diff --git a/core/mpi/exception.cpp b/core/mpi/exception.cpp index 8ffd6b0f133..d4d66012c14 100644 --- a/core/mpi/exception.cpp +++ b/core/mpi/exception.cpp @@ -5,10 +5,8 @@ #include #include - #include - #include diff --git a/core/multigrid/fixed_coarsening.cpp b/core/multigrid/fixed_coarsening.cpp index e7024d334ad..1cbdd557fb4 100644 --- a/core/multigrid/fixed_coarsening.cpp +++ b/core/multigrid/fixed_coarsening.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/multigrid/fixed_coarsening.hpp" - #include #include #include @@ -17,7 +16,6 @@ #include #include - #include "core/base/utils.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/matrix/csr_builder.hpp" diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index f0393794d94..9f1f5b50ba6 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/multigrid/pgm.hpp" - #include #include #include @@ -22,7 +21,6 @@ #include #include - #include "core/base/dispatch_helper.hpp" #include "core/base/iterator_factory.hpp" #include "core/base/utils.hpp" diff --git a/core/multigrid/pgm_kernels.hpp b/core/multigrid/pgm_kernels.hpp index 4118507ac7d..a7a0a4aa099 100644 --- a/core/multigrid/pgm_kernels.hpp +++ b/core/multigrid/pgm_kernels.hpp @@ -8,14 +8,12 @@ #include - #include #include #include #include #include - #include 
"core/base/kernel_declaration.hpp" diff --git a/core/preconditioner/batch_jacobi.cpp b/core/preconditioner/batch_jacobi.cpp index 3f18a32123f..f92ccd18cfc 100644 --- a/core/preconditioner/batch_jacobi.cpp +++ b/core/preconditioner/batch_jacobi.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/preconditioner/batch_jacobi.hpp" - #include "core/matrix/batch_csr_kernels.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/preconditioner/batch_jacobi_kernels.hpp" diff --git a/core/preconditioner/batch_jacobi_kernels.hpp b/core/preconditioner/batch_jacobi_kernels.hpp index c37db81d72f..784ab2d1f6e 100644 --- a/core/preconditioner/batch_jacobi_kernels.hpp +++ b/core/preconditioner/batch_jacobi_kernels.hpp @@ -6,14 +6,11 @@ #define GKO_CORE_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_ -#include - - #include #include #include #include - +#include #include "core/base/kernel_declaration.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" diff --git a/core/preconditioner/ic.cpp b/core/preconditioner/ic.cpp index 37eb0cb5b3f..691795ad60b 100644 --- a/core/preconditioner/ic.cpp +++ b/core/preconditioner/ic.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/preconditioner/ic.hpp" - #include #include #include @@ -13,7 +12,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/dispatch.hpp" diff --git a/core/preconditioner/ilu.cpp b/core/preconditioner/ilu.cpp index 00422300172..d6f49e49588 100644 --- a/core/preconditioner/ilu.cpp +++ b/core/preconditioner/ilu.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/preconditioner/ilu.hpp" - #include #include #include @@ -14,7 +13,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/dispatch.hpp" diff --git a/core/preconditioner/isai.cpp b/core/preconditioner/isai.cpp index f825e2f5c82..9684f1bdb27 100644 --- a/core/preconditioner/isai.cpp +++ b/core/preconditioner/isai.cpp @@ -4,12 +4,10 @@ #include "ginkgo/core/preconditioner/isai.hpp" - #include #include #include - #include #include 
#include @@ -22,7 +20,6 @@ #include #include - #include "core/base/array_access.hpp" #include "core/base/utils.hpp" #include "core/config/config_helper.hpp" diff --git a/core/preconditioner/isai_kernels.hpp b/core/preconditioner/isai_kernels.hpp index 9c3f89d7b5e..d1897251916 100644 --- a/core/preconditioner/isai_kernels.hpp +++ b/core/preconditioner/isai_kernels.hpp @@ -6,11 +6,8 @@ #define GKO_CORE_PRECONDITIONER_ISAI_KERNELS_HPP_ -#include - - #include - +#include #include "core/base/kernel_declaration.hpp" diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index 8081f31712a..f6d5b042a23 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/preconditioner/jacobi.hpp" - #include - #include #include #include @@ -19,7 +17,6 @@ #include #include - #include "core/base/extended_float.hpp" #include "core/base/utils.hpp" #include "core/config/config_helper.hpp" diff --git a/core/preconditioner/jacobi_kernels.hpp b/core/preconditioner/jacobi_kernels.hpp index ee5227a6c0b..e29791e0a6e 100644 --- a/core/preconditioner/jacobi_kernels.hpp +++ b/core/preconditioner/jacobi_kernels.hpp @@ -6,11 +6,8 @@ #define GKO_CORE_PRECONDITIONER_JACOBI_KERNELS_HPP_ -#include - - #include - +#include #include "core/base/kernel_declaration.hpp" diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp index 8fb80f0ad94..e159fd15776 100644 --- a/core/preconditioner/jacobi_utils.hpp +++ b/core/preconditioner/jacobi_utils.hpp @@ -9,7 +9,6 @@ #include #include - #include "core/base/extended_float.hpp" diff --git a/core/reorder/amd.cpp b/core/reorder/amd.cpp index 7cb24c39ea0..3828bce9197 100644 --- a/core/reorder/amd.cpp +++ b/core/reorder/amd.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/reorder/amd.hpp" - #include - #include #include #include @@ -15,7 +13,6 @@ #include #include - #include "core/base/allocator.hpp" diff --git a/core/reorder/mc64.cpp b/core/reorder/mc64.cpp 
index e47969c0b71..97dd37b90fc 100644 --- a/core/reorder/mc64.cpp +++ b/core/reorder/mc64.cpp @@ -4,11 +4,9 @@ #include "ginkgo/core/reorder/mc64.hpp" - #include #include - #include #include #include @@ -20,7 +18,6 @@ #include #include - #include "core/components/addressable_pq.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/core/reorder/mc64.hpp b/core/reorder/mc64.hpp index e6b34614f3c..97456f93464 100644 --- a/core/reorder/mc64.hpp +++ b/core/reorder/mc64.hpp @@ -6,12 +6,9 @@ #define GKO_CORE_REORDER_MC64_HPP_ -#include - - #include #include - +#include #include "core/components/addressable_pq.hpp" diff --git a/core/reorder/nested_dissection.cpp b/core/reorder/nested_dissection.cpp index bf9c8ba7a3d..e14af9ffbfc 100644 --- a/core/reorder/nested_dissection.cpp +++ b/core/reorder/nested_dissection.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/reorder/nested_dissection.hpp" - #include - #include #include #include diff --git a/core/reorder/rcm.cpp b/core/reorder/rcm.cpp index f3a16cc92a6..1acf4d97f1f 100644 --- a/core/reorder/rcm.cpp +++ b/core/reorder/rcm.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/reorder/rcm.hpp" - #include - #include #include #include @@ -18,7 +16,6 @@ #include #include - #include "core/matrix/csr_kernels.hpp" #include "core/reorder/rcm_kernels.hpp" diff --git a/core/reorder/rcm_kernels.hpp b/core/reorder/rcm_kernels.hpp index 77e1ce68ff0..a89b2732cb0 100644 --- a/core/reorder/rcm_kernels.hpp +++ b/core/reorder/rcm_kernels.hpp @@ -6,19 +6,15 @@ #define GKO_CORE_REORDER_RCM_KERNELS_HPP_ -#include - - #include - #include #include #include #include #include #include - +#include #include "core/base/kernel_declaration.hpp" diff --git a/core/reorder/scaled_reordered.cpp b/core/reorder/scaled_reordered.cpp index cf246ea3194..264122c0b8f 100644 --- a/core/reorder/scaled_reordered.cpp +++ b/core/reorder/scaled_reordered.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/reorder/scaled_reordered.hpp" - 
#include - #include #include diff --git a/core/solver/batch_bicgstab.cpp b/core/solver/batch_bicgstab.cpp index 9621f058097..c22c712b411 100644 --- a/core/solver/batch_bicgstab.cpp +++ b/core/solver/batch_bicgstab.cpp @@ -4,12 +4,10 @@ #include "ginkgo/core/solver/batch_bicgstab.hpp" - #include #include #include - #include "core/base/batch_multi_vector_kernels.hpp" #include "core/solver/batch_bicgstab_kernels.hpp" diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp index 43f55f1356d..1eed30aba5a 100644 --- a/core/solver/batch_bicgstab_kernels.hpp +++ b/core/solver/batch_bicgstab_kernels.hpp @@ -12,7 +12,6 @@ #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/solver/batch_cg.cpp b/core/solver/batch_cg.cpp index d2fe4a5f00d..0ab1ca8564f 100644 --- a/core/solver/batch_cg.cpp +++ b/core/solver/batch_cg.cpp @@ -4,12 +4,10 @@ #include "ginkgo/core/solver/batch_cg.hpp" - #include #include #include - #include "core/base/batch_multi_vector_kernels.hpp" #include "core/solver/batch_cg_kernels.hpp" diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp index d2c64460be2..6fdb595862e 100644 --- a/core/solver/batch_cg_kernels.hpp +++ b/core/solver/batch_cg_kernels.hpp @@ -12,7 +12,6 @@ #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index 1cafda169c2..8a142a5224a 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -17,7 +17,6 @@ #include #include - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp index 51ba251aecd..c379cb8df08 100644 --- a/core/solver/bicg.cpp +++ b/core/solver/bicg.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/solver/bicg.hpp" - #include #include #include @@ -12,7 +11,6 @@ #include #include - #include "core/config/solver_config.hpp" #include 
"core/solver/bicg_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" diff --git a/core/solver/bicg_kernels.hpp b/core/solver/bicg_kernels.hpp index 712df21e90c..5e94d8ca350 100644 --- a/core/solver/bicg_kernels.hpp +++ b/core/solver/bicg_kernels.hpp @@ -8,14 +8,12 @@ #include - #include #include #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp index e1f2f1cb77e..c254b417765 100644 --- a/core/solver/bicgstab.cpp +++ b/core/solver/bicgstab.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/solver/bicgstab.hpp" - #include #include #include @@ -13,7 +12,6 @@ #include #include - #include "core/config/solver_config.hpp" #include "core/distributed/helpers.hpp" #include "core/solver/bicgstab_kernels.hpp" diff --git a/core/solver/bicgstab_kernels.hpp b/core/solver/bicgstab_kernels.hpp index 8160381d4f3..e3bfbdcdcb6 100644 --- a/core/solver/bicgstab_kernels.hpp +++ b/core/solver/bicgstab_kernels.hpp @@ -8,14 +8,12 @@ #include - #include #include #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/solver/cb_gmres.cpp b/core/solver/cb_gmres.cpp index 812c6c222ce..274948531ab 100644 --- a/core/solver/cb_gmres.cpp +++ b/core/solver/cb_gmres.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/solver/cb_gmres.hpp" - #include - #include #include #include @@ -18,7 +16,6 @@ #include #include - #include "core/base/extended_float.hpp" #include "core/config/solver_config.hpp" #include "core/solver/cb_gmres_accessor.hpp" diff --git a/core/solver/cb_gmres_accessor.hpp b/core/solver/cb_gmres_accessor.hpp index e216171a6f5..64a7c9a46e5 100644 --- a/core/solver/cb_gmres_accessor.hpp +++ b/core/solver/cb_gmres_accessor.hpp @@ -12,7 +12,6 @@ #include #include - #include #include #include @@ -20,7 +19,6 @@ #include #include - #include "accessor/range.hpp" #include "accessor/reduced_row_major.hpp" #include "accessor/scaled_reduced_row_major.hpp" diff --git 
a/core/solver/cb_gmres_kernels.hpp b/core/solver/cb_gmres_kernels.hpp index 3e5d8c89f25..29a84f25ba1 100644 --- a/core/solver/cb_gmres_kernels.hpp +++ b/core/solver/cb_gmres_kernels.hpp @@ -12,7 +12,6 @@ #include #include - #include "accessor/reduced_row_major.hpp" #include "accessor/scaled_reduced_row_major.hpp" #include "core/base/extended_float.hpp" diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp index a8e534588a0..20487b4cd0d 100644 --- a/core/solver/cg.cpp +++ b/core/solver/cg.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/solver/cg.hpp" - #include #include #include @@ -13,7 +12,6 @@ #include #include - #include "core/config/solver_config.hpp" #include "core/distributed/helpers.hpp" #include "core/solver/cg_kernels.hpp" diff --git a/core/solver/cg_kernels.hpp b/core/solver/cg_kernels.hpp index 127126317d7..bec5f04d0e5 100644 --- a/core/solver/cg_kernels.hpp +++ b/core/solver/cg_kernels.hpp @@ -8,14 +8,12 @@ #include - #include #include #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp index 9d6a575fdbf..19f625228a3 100644 --- a/core/solver/cgs.cpp +++ b/core/solver/cgs.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/solver/cgs.hpp" - #include #include #include @@ -13,7 +12,6 @@ #include #include - #include "core/config/solver_config.hpp" #include "core/distributed/helpers.hpp" #include "core/solver/cgs_kernels.hpp" diff --git a/core/solver/cgs_kernels.hpp b/core/solver/cgs_kernels.hpp index 5d64a7a0ed1..d64aeedb549 100644 --- a/core/solver/cgs_kernels.hpp +++ b/core/solver/cgs_kernels.hpp @@ -8,14 +8,12 @@ #include - #include #include #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/solver/common_gmres_kernels.hpp b/core/solver/common_gmres_kernels.hpp index 9174cc2c4e4..0209284c446 100644 --- a/core/solver/common_gmres_kernels.hpp +++ b/core/solver/common_gmres_kernels.hpp @@ -12,7 +12,6 @@ #include #include - #include 
"core/base/kernel_declaration.hpp" diff --git a/core/solver/direct.cpp b/core/solver/direct.cpp index 717fd71698f..c999fdea4fc 100644 --- a/core/solver/direct.cpp +++ b/core/solver/direct.cpp @@ -4,15 +4,12 @@ #include "ginkgo/core/solver/direct.hpp" - #include - #include #include #include - #include "core/config/config_helper.hpp" diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp index dee37467c46..c4f79854c0a 100644 --- a/core/solver/fcg.cpp +++ b/core/solver/fcg.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/solver/fcg.hpp" - #include #include #include @@ -12,7 +11,6 @@ #include #include - #include "core/config/solver_config.hpp" #include "core/distributed/helpers.hpp" #include "core/solver/fcg_kernels.hpp" diff --git a/core/solver/fcg_kernels.hpp b/core/solver/fcg_kernels.hpp index 4eda3e631c8..bb646055906 100644 --- a/core/solver/fcg_kernels.hpp +++ b/core/solver/fcg_kernels.hpp @@ -8,13 +8,11 @@ #include - #include #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/solver/gcr.cpp b/core/solver/gcr.cpp index cb2b55a3460..d5131632dc3 100644 --- a/core/solver/gcr.cpp +++ b/core/solver/gcr.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/solver/gcr.hpp" - #include #include #include @@ -15,7 +14,6 @@ #include #include - #include "core/config/solver_config.hpp" #include "core/distributed/helpers.hpp" #include "core/solver/gcr_kernels.hpp" diff --git a/core/solver/gcr_kernels.hpp b/core/solver/gcr_kernels.hpp index 6d4e827b4f9..d33f31db571 100644 --- a/core/solver/gcr_kernels.hpp +++ b/core/solver/gcr_kernels.hpp @@ -12,7 +12,6 @@ #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index b0ad6baa01e..cd3d88a5c02 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/solver/gmres.hpp" - #include #include #include @@ -16,7 +15,6 @@ #include #include - #include "core/config/solver_config.hpp" #include 
"core/distributed/helpers.hpp" #include "core/solver/common_gmres_kernels.hpp" diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp index 3cc5d457edb..196b0de3ab0 100644 --- a/core/solver/gmres_kernels.hpp +++ b/core/solver/gmres_kernels.hpp @@ -12,7 +12,6 @@ #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index 4bc56562d3b..c6d89b84ea6 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/solver/idr.hpp" - #include #include #include @@ -12,7 +11,6 @@ #include #include - #include "core/config/solver_config.hpp" #include "core/distributed/helpers.hpp" #include "core/solver/idr_kernels.hpp" diff --git a/core/solver/idr_kernels.hpp b/core/solver/idr_kernels.hpp index 02d9fa88511..3d579bd01af 100644 --- a/core/solver/idr_kernels.hpp +++ b/core/solver/idr_kernels.hpp @@ -12,7 +12,6 @@ #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/solver/ir.cpp b/core/solver/ir.cpp index 3a6b0b1d2d0..75efac351f9 100644 --- a/core/solver/ir.cpp +++ b/core/solver/ir.cpp @@ -4,12 +4,10 @@ #include "ginkgo/core/solver/ir.hpp" - #include #include #include - #include "core/config/config_helper.hpp" #include "core/distributed/helpers.hpp" #include "core/solver/ir_kernels.hpp" diff --git a/core/solver/ir_kernels.hpp b/core/solver/ir_kernels.hpp index f6d0f79242d..a411c9f375d 100644 --- a/core/solver/ir_kernels.hpp +++ b/core/solver/ir_kernels.hpp @@ -8,13 +8,11 @@ #include - #include #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/solver/lower_trs.cpp b/core/solver/lower_trs.cpp index e36ec98f8fb..e8230625ab3 100644 --- a/core/solver/lower_trs.cpp +++ b/core/solver/lower_trs.cpp @@ -13,7 +13,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/trisolver_config.hpp" #include "core/solver/lower_trs_kernels.hpp" diff --git 
a/core/solver/lower_trs_kernels.hpp b/core/solver/lower_trs_kernels.hpp index ce13f4b4f14..8bb0031e801 100644 --- a/core/solver/lower_trs_kernels.hpp +++ b/core/solver/lower_trs_kernels.hpp @@ -8,13 +8,11 @@ #include - #include #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index d7fc1d3c997..6a8b5ee151b 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/solver/multigrid.hpp" - #include - #include #include #include @@ -27,7 +25,6 @@ #include #include - #include "core/base/dispatch_helper.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/config/config_helper.hpp" diff --git a/core/solver/multigrid_kernels.hpp b/core/solver/multigrid_kernels.hpp index 2e123cdbfec..73c660cbefb 100644 --- a/core/solver/multigrid_kernels.hpp +++ b/core/solver/multigrid_kernels.hpp @@ -12,7 +12,6 @@ #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/solver/upper_trs.cpp b/core/solver/upper_trs.cpp index 5a854bddf1e..be6fcc71275 100644 --- a/core/solver/upper_trs.cpp +++ b/core/solver/upper_trs.cpp @@ -13,7 +13,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/trisolver_config.hpp" #include "core/solver/upper_trs_kernels.hpp" diff --git a/core/solver/upper_trs_kernels.hpp b/core/solver/upper_trs_kernels.hpp index e002b68f92d..d409aa7db09 100644 --- a/core/solver/upper_trs_kernels.hpp +++ b/core/solver/upper_trs_kernels.hpp @@ -8,13 +8,11 @@ #include - #include #include #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/stop/criterion.cpp b/core/stop/criterion.cpp index 02f04876f9f..1684e4bdeab 100644 --- a/core/stop/criterion.cpp +++ b/core/stop/criterion.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/stop/criterion.hpp" - #include "core/stop/criterion_kernels.hpp" diff --git a/core/stop/criterion_kernels.hpp 
b/core/stop/criterion_kernels.hpp index 242a2ee7f52..62e4135ee37 100644 --- a/core/stop/criterion_kernels.hpp +++ b/core/stop/criterion_kernels.hpp @@ -10,7 +10,6 @@ #include #include - #include "core/base/kernel_declaration.hpp" diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp index 824ab87ec0f..adf7da3e2e6 100644 --- a/core/stop/residual_norm.cpp +++ b/core/stop/residual_norm.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/stop/residual_norm.hpp" - #include - #include "core/base/dispatch_helper.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/distributed/helpers.hpp" diff --git a/core/synthesizer/implementation_selection.hpp b/core/synthesizer/implementation_selection.hpp index 5e8796bb6b4..d34949ca3e3 100644 --- a/core/synthesizer/implementation_selection.hpp +++ b/core/synthesizer/implementation_selection.hpp @@ -8,7 +8,6 @@ #include - #include #include diff --git a/core/test/accessor/block_col_major.cpp b/core/test/accessor/block_col_major.cpp index 327343f8eb1..2b214416220 100644 --- a/core/test/accessor/block_col_major.cpp +++ b/core/test/accessor/block_col_major.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "accessor/block_col_major.hpp" + #include #include #include - #include - -#include "accessor/block_col_major.hpp" #include "accessor/index_span.hpp" #include "accessor/range.hpp" diff --git a/core/test/accessor/index_span.cpp b/core/test/accessor/index_span.cpp index 37cb1f36612..368b1d1c3ae 100644 --- a/core/test/accessor/index_span.cpp +++ b/core/test/accessor/index_span.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include "accessor/index_span.hpp" +#include + namespace { diff --git a/core/test/accessor/math.cpp b/core/test/accessor/math.cpp index f15644be93f..32bacac9043 100644 --- a/core/test/accessor/math.cpp +++ b/core/test/accessor/math.cpp @@ -2,16 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "accessor/math.hpp" + #include 
#include - #include -#include "accessor/math.hpp" - - namespace { diff --git a/core/test/accessor/range.cpp b/core/test/accessor/range.cpp index a4fe79ec68e..d03f0ae6d1d 100644 --- a/core/test/accessor/range.cpp +++ b/core/test/accessor/range.cpp @@ -2,16 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "accessor/range.hpp" + #include #include - #include -#include "accessor/range.hpp" - - namespace { diff --git a/core/test/accessor/reduced_row_major.cpp b/core/test/accessor/reduced_row_major.cpp index 636ba7a6d17..a270e90fb2a 100644 --- a/core/test/accessor/reduced_row_major.cpp +++ b/core/test/accessor/reduced_row_major.cpp @@ -2,19 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "accessor/reduced_row_major.hpp" + #include #include #include #include #include - #include - #include "accessor/index_span.hpp" #include "accessor/range.hpp" -#include "accessor/reduced_row_major.hpp" #include "accessor/utils.hpp" diff --git a/core/test/accessor/reduced_row_major_ginkgo.cpp b/core/test/accessor/reduced_row_major_ginkgo.cpp index 1b484ce3e8c..7acad0b9638 100644 --- a/core/test/accessor/reduced_row_major_ginkgo.cpp +++ b/core/test/accessor/reduced_row_major_ginkgo.cpp @@ -8,10 +8,8 @@ #include #include - #include - #include "accessor/index_span.hpp" #include "accessor/range.hpp" #include "accessor/reduced_row_major.hpp" diff --git a/core/test/accessor/reduced_row_major_reference.cpp b/core/test/accessor/reduced_row_major_reference.cpp index a6da6277b1d..58f249c5275 100644 --- a/core/test/accessor/reduced_row_major_reference.cpp +++ b/core/test/accessor/reduced_row_major_reference.cpp @@ -2,18 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "accessor/reduced_row_major_reference.hpp" + #include #include #include #include #include - #include - #include "accessor/math.hpp" -#include "accessor/reduced_row_major_reference.hpp" #include "accessor/utils.hpp" diff --git a/core/test/accessor/row_major.cpp b/core/test/accessor/row_major.cpp 
index 4902c29b9af..68f4e295006 100644 --- a/core/test/accessor/row_major.cpp +++ b/core/test/accessor/row_major.cpp @@ -2,17 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "accessor/row_major.hpp" + #include #include #include - #include - #include "accessor/index_span.hpp" #include "accessor/range.hpp" -#include "accessor/row_major.hpp" namespace { diff --git a/core/test/accessor/scaled_reduced_row_major.cpp b/core/test/accessor/scaled_reduced_row_major.cpp index eacd196985d..5a7b2bb2fe1 100644 --- a/core/test/accessor/scaled_reduced_row_major.cpp +++ b/core/test/accessor/scaled_reduced_row_major.cpp @@ -2,19 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "accessor/scaled_reduced_row_major.hpp" + #include #include #include #include #include - #include - #include "accessor/index_span.hpp" #include "accessor/range.hpp" -#include "accessor/scaled_reduced_row_major.hpp" namespace { diff --git a/core/test/accessor/scaled_reduced_row_major_reference.cpp b/core/test/accessor/scaled_reduced_row_major_reference.cpp index 281ae9a6735..a91fe81b428 100644 --- a/core/test/accessor/scaled_reduced_row_major_reference.cpp +++ b/core/test/accessor/scaled_reduced_row_major_reference.cpp @@ -2,18 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "accessor/scaled_reduced_row_major_reference.hpp" + #include #include #include #include #include - #include - #include "accessor/math.hpp" -#include "accessor/scaled_reduced_row_major_reference.hpp" #include "accessor/utils.hpp" diff --git a/core/test/base/abstract_factory.cpp b/core/test/base/abstract_factory.cpp index afee76c0ed2..26127301b67 100644 --- a/core/test/base/abstract_factory.cpp +++ b/core/test/base/abstract_factory.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include + namespace { diff --git a/core/test/base/allocator.cpp b/core/test/base/allocator.cpp index 69ae72cd398..2dce9babec8 100644 --- a/core/test/base/allocator.cpp +++ 
b/core/test/base/allocator.cpp @@ -4,10 +4,8 @@ #include "core/base/allocator.hpp" - #include - #include diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp index 59b1ef177f3..71816f690ce 100644 --- a/core/test/base/array.cpp +++ b/core/test/base/array.cpp @@ -2,19 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include - #include - +#include #include - #include "core/base/array_access.hpp" #include "core/test/utils.hpp" diff --git a/core/test/base/batch_dim.cpp b/core/test/base/batch_dim.cpp index 989e153bb75..a8d324ba1d5 100644 --- a/core/test/base/batch_dim.cpp +++ b/core/test/base/batch_dim.cpp @@ -2,14 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include +#include + TEST(BatchDim, ConstructsCorrectUniformObject) { diff --git a/core/test/base/batch_lin_op.cpp b/core/test/base/batch_lin_op.cpp index 6b7a40de3c0..865d7f10314 100644 --- a/core/test/base/batch_lin_op.cpp +++ b/core/test/base/batch_lin_op.cpp @@ -2,17 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include - #include - +#include #include #include #include diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp index 748a52d3227..3798f30ce65 100644 --- a/core/test/base/batch_multi_vector.cpp +++ b/core/test/base/batch_multi_vector.cpp @@ -2,17 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - +#include #include #include #include - #include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" #include "core/test/utils/batch_helpers.hpp" diff --git a/core/test/base/block_operator.cpp b/core/test/base/block_operator.cpp index 54ac3649d9e..2d2bddb357f 100644 --- a/core/test/base/block_operator.cpp +++ b/core/test/base/block_operator.cpp @@ -2,18 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - +#include #include - #include "core/test/utils.hpp" diff --git 
a/core/test/base/combination.cpp b/core/test/base/combination.cpp index 1be8324e95b..73c30ffe11c 100644 --- a/core/test/base/combination.cpp +++ b/core/test/base/combination.cpp @@ -2,14 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include +#include #include "core/test/utils.hpp" diff --git a/core/test/base/composition.cpp b/core/test/base/composition.cpp index 686ff23678a..122755b8f92 100644 --- a/core/test/base/composition.cpp +++ b/core/test/base/composition.cpp @@ -2,14 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include +#include #include "core/test/utils.hpp" diff --git a/core/test/base/deferred_factory.cpp b/core/test/base/deferred_factory.cpp index 79bd9672fbd..a1c02103cf8 100644 --- a/core/test/base/deferred_factory.cpp +++ b/core/test/base/deferred_factory.cpp @@ -4,7 +4,6 @@ #include - #include #include #include diff --git a/core/test/base/dense_cache.cpp b/core/test/base/dense_cache.cpp index 35eeaab64f3..526187610a4 100644 --- a/core/test/base/dense_cache.cpp +++ b/core/test/base/dense_cache.cpp @@ -2,15 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - +#include #include - #include "core/test/utils.hpp" diff --git a/core/test/base/dim.cpp b/core/test/base/dim.cpp index 2ea3f65dcf7..168f2bccc01 100644 --- a/core/test/base/dim.cpp +++ b/core/test/base/dim.cpp @@ -2,14 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include +#include + namespace { diff --git a/core/test/base/exception.cpp b/core/test/base/exception.cpp index 45f629351f4..ec5d4bf5763 100644 --- a/core/test/base/exception.cpp +++ b/core/test/base/exception.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include + namespace { diff --git a/core/test/base/exception_helpers.cpp b/core/test/base/exception_helpers.cpp index 228389c738a..50f81707ead 100644 --- a/core/test/base/exception_helpers.cpp +++ 
b/core/test/base/exception_helpers.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include + namespace { diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index dc4ea5aad63..64a11929983 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -2,12 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include +#include + #if defined(__unix__) || defined(__APPLE__) #include @@ -16,7 +15,6 @@ #include - #include #include diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp index af31de1ae3c..6148c7c350a 100644 --- a/core/test/base/extended_float.cpp +++ b/core/test/base/extended_float.cpp @@ -4,11 +4,9 @@ #include "core/base/extended_float.hpp" - #include #include - #include diff --git a/core/test/base/index_range.cpp b/core/test/base/index_range.cpp index 8fef94e407f..9845638d446 100644 --- a/core/test/base/index_range.cpp +++ b/core/test/base/index_range.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include "core/base/index_range.hpp" +#include + TEST(IRange, KnowsItsProperties) { diff --git a/core/test/base/iterator_factory.cpp b/core/test/base/iterator_factory.cpp index d113f3198f4..42ddff343c0 100644 --- a/core/test/base/iterator_factory.cpp +++ b/core/test/base/iterator_factory.cpp @@ -4,16 +4,13 @@ #include "core/base/iterator_factory.hpp" - #include #include #include #include - #include - #include "core/test/utils.hpp" diff --git a/core/test/base/lin_op.cpp b/core/test/base/lin_op.cpp index 5ffdc640b6b..2496c612193 100644 --- a/core/test/base/lin_op.cpp +++ b/core/test/base/lin_op.cpp @@ -2,18 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include - #include - #include +#include #include diff --git a/core/test/base/math.cpp b/core/test/base/math.cpp index 33aed51d71d..c947f5c0d1b 100644 --- a/core/test/base/math.cpp +++ b/core/test/base/math.cpp @@ 
-2,17 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include - #include +#include + namespace { diff --git a/core/test/base/matrix_assembly_data.cpp b/core/test/base/matrix_assembly_data.cpp index e65f9079946..d3ed8057659 100644 --- a/core/test/base/matrix_assembly_data.cpp +++ b/core/test/base/matrix_assembly_data.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include + namespace { diff --git a/core/test/base/matrix_data.cpp b/core/test/base/matrix_data.cpp index cbc09e99dec..aa63d03dfc2 100644 --- a/core/test/base/matrix_data.cpp +++ b/core/test/base/matrix_data.cpp @@ -2,14 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include +#include + namespace { diff --git a/core/test/base/mtx_io.cpp b/core/test/base/mtx_io.cpp index 3743f7d05b8..66b6766b2d3 100644 --- a/core/test/base/mtx_io.cpp +++ b/core/test/base/mtx_io.cpp @@ -2,22 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include - #include - #include #include +#include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/base/perturbation.cpp b/core/test/base/perturbation.cpp index 578c54555aa..e79d52659da 100644 --- a/core/test/base/perturbation.cpp +++ b/core/test/base/perturbation.cpp @@ -2,14 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include +#include + namespace { diff --git a/core/test/base/polymorphic_object.cpp b/core/test/base/polymorphic_object.cpp index 8c6a0a23ef4..ff41a47913b 100644 --- a/core/test/base/polymorphic_object.cpp +++ b/core/test/base/polymorphic_object.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include + namespace { diff --git a/core/test/base/range.cpp b/core/test/base/range.cpp index c6a81806d86..76535038b97 100644 --- a/core/test/base/range.cpp +++ b/core/test/base/range.cpp @@ -2,14 +2,12 @@ // // SPDX-License-Identifier: 
BSD-3-Clause -#include - - #include - #include +#include + namespace { diff --git a/core/test/base/range_accessors.cpp b/core/test/base/range_accessors.cpp index b9758a39d19..51335c98da7 100644 --- a/core/test/base/range_accessors.cpp +++ b/core/test/base/range_accessors.cpp @@ -2,15 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - +#include #include - #include "core/test/utils.hpp" diff --git a/core/test/base/sanitizers.cpp b/core/test/base/sanitizers.cpp index 44792ecc502..bdd02b09575 100644 --- a/core/test/base/sanitizers.cpp +++ b/core/test/base/sanitizers.cpp @@ -6,7 +6,6 @@ #include #include - #include diff --git a/core/test/base/segmented_array.cpp b/core/test/base/segmented_array.cpp index 0c004223b12..2741990036f 100644 --- a/core/test/base/segmented_array.cpp +++ b/core/test/base/segmented_array.cpp @@ -2,13 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/base/segmented_array.hpp" #include +#include -#include "core/base/segmented_array.hpp" #include "core/test/utils.hpp" diff --git a/core/test/base/types.cpp b/core/test/base/types.cpp index e537eba9bc7..507b86cd056 100644 --- a/core/test/base/types.cpp +++ b/core/test/base/types.cpp @@ -2,19 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/base/types.hpp" #include #include #include #include - #include - -#include "core/base/types.hpp" +#include namespace { diff --git a/core/test/base/utils.cpp b/core/test/base/utils.cpp index f6f7ff2b046..1ad4705b824 100644 --- a/core/test/base/utils.cpp +++ b/core/test/base/utils.cpp @@ -2,13 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include +#include namespace { diff --git a/core/test/base/version.cpp b/core/test/base/version.cpp index a08fb308e51..2e65f25ea1a 100644 --- a/core/test/base/version.cpp +++ b/core/test/base/version.cpp @@ -2,14 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include +#include + 
namespace { diff --git a/core/test/components/addressable_pq.cpp b/core/test/components/addressable_pq.cpp index 834440b45fb..6301cd44fb4 100644 --- a/core/test/components/addressable_pq.cpp +++ b/core/test/components/addressable_pq.cpp @@ -4,17 +4,13 @@ #include "core/components/addressable_pq.hpp" - #include #include - #include - #include - #include "core/test/utils.hpp" diff --git a/core/test/components/disjoint_sets.cpp b/core/test/components/disjoint_sets.cpp index e23fb75dcb9..a014d47cbb1 100644 --- a/core/test/components/disjoint_sets.cpp +++ b/core/test/components/disjoint_sets.cpp @@ -4,18 +4,14 @@ #include "core/components/disjoint_sets.hpp" - #include #include #include - #include - #include - #include "core/test/utils.hpp" diff --git a/core/test/config/config.cpp b/core/test/config/config.cpp index 163f6936de2..d5fed0f90c3 100644 --- a/core/test/config/config.cpp +++ b/core/test/config/config.cpp @@ -2,13 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include +#include #include #include #include @@ -16,7 +13,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/test/utils.hpp" diff --git a/core/test/config/factorization.cpp b/core/test/config/factorization.cpp index 405641265a2..f5a4b19d3d9 100644 --- a/core/test/config/factorization.cpp +++ b/core/test/config/factorization.cpp @@ -4,10 +4,8 @@ #include - #include - #include #include #include @@ -22,7 +20,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/registry_accessor.hpp" #include "core/test/utils.hpp" diff --git a/core/test/config/multigrid.cpp b/core/test/config/multigrid.cpp index 5c13d0525cf..5eb8e622088 100644 --- a/core/test/config/multigrid.cpp +++ b/core/test/config/multigrid.cpp @@ -4,10 +4,8 @@ #include - #include - #include #include #include @@ -16,7 +14,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/registry_accessor.hpp" #include "core/test/utils.hpp" 
diff --git a/core/test/config/preconditioner.cpp b/core/test/config/preconditioner.cpp index d404a97b4b8..b11ea3b6705 100644 --- a/core/test/config/preconditioner.cpp +++ b/core/test/config/preconditioner.cpp @@ -4,10 +4,8 @@ #include - #include - #include #include #include @@ -18,7 +16,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/registry_accessor.hpp" #include "core/test/utils.hpp" diff --git a/core/test/config/property_tree.cpp b/core/test/config/property_tree.cpp index a552a6c08d8..8fe49aa6926 100644 --- a/core/test/config/property_tree.cpp +++ b/core/test/config/property_tree.cpp @@ -2,19 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include - #include - #include +#include using namespace gko::config; diff --git a/core/test/config/registry.cpp b/core/test/config/registry.cpp index e6fc8eef671..a8d1acb1cf8 100644 --- a/core/test/config/registry.cpp +++ b/core/test/config/registry.cpp @@ -2,21 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include +#include #include #include #include #include #include - #include "core/config/config_helper.hpp" #include "core/config/registry_accessor.hpp" #include "core/test/utils.hpp" diff --git a/core/test/config/solver.cpp b/core/test/config/solver.cpp index b40c4dc1781..8a2f025d00a 100644 --- a/core/test/config/solver.cpp +++ b/core/test/config/solver.cpp @@ -4,10 +4,8 @@ #include - #include - #include #include #include @@ -25,7 +23,6 @@ #include #include - #include "core/config/config_helper.hpp" #include "core/config/registry_accessor.hpp" #include "core/test/utils.hpp" diff --git a/core/test/config/type_descriptor.cpp b/core/test/config/type_descriptor.cpp index a387ebe44b7..ff519e88101 100644 --- a/core/test/config/type_descriptor.cpp +++ b/core/test/config/type_descriptor.cpp @@ -2,11 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include #include 
"core/config/type_descriptor_helper.hpp" #include "core/test/utils.hpp" diff --git a/core/test/distributed/index_map.cpp b/core/test/distributed/index_map.cpp index 521a2e2d094..8602bb025f5 100644 --- a/core/test/distributed/index_map.cpp +++ b/core/test/distributed/index_map.cpp @@ -2,14 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/factorization/elimination_forest.cpp b/core/test/factorization/elimination_forest.cpp index 3cbe5f3ae7b..292b366f50e 100644 --- a/core/test/factorization/elimination_forest.cpp +++ b/core/test/factorization/elimination_forest.cpp @@ -4,17 +4,13 @@ #include "core/factorization/elimination_forest.hpp" - #include #include - #include - #include - #include "core/test/utils.hpp" #include "matrices/config.hpp" diff --git a/core/test/factorization/par_ic.cpp b/core/test/factorization/par_ic.cpp index c580b5ea139..d6de0f9fc98 100644 --- a/core/test/factorization/par_ic.cpp +++ b/core/test/factorization/par_ic.cpp @@ -2,14 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/factorization/par_ict.cpp b/core/test/factorization/par_ict.cpp index 10e0dd7b99c..07eec8db549 100644 --- a/core/test/factorization/par_ict.cpp +++ b/core/test/factorization/par_ict.cpp @@ -2,14 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/factorization/par_ilu.cpp b/core/test/factorization/par_ilu.cpp index f3904093024..a0b8f37e3d4 100644 --- a/core/test/factorization/par_ilu.cpp +++ b/core/test/factorization/par_ilu.cpp @@ -2,14 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/factorization/par_ilut.cpp b/core/test/factorization/par_ilut.cpp index 
4d5e8ea88d8..ad466e62407 100644 --- a/core/test/factorization/par_ilut.cpp +++ b/core/test/factorization/par_ilut.cpp @@ -2,14 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index 269845c3e9c..01250c41929 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -10,15 +10,12 @@ #include #include - #include - #include #include #include - #include "core/test/gtest/resources.hpp" diff --git a/core/test/gtest/ginkgo_main.cpp b/core/test/gtest/ginkgo_main.cpp index d0e5560be84..7ba3d80c52c 100644 --- a/core/test/gtest/ginkgo_main.cpp +++ b/core/test/gtest/ginkgo_main.cpp @@ -4,7 +4,6 @@ #include - #include "core/test/gtest/environments.hpp" diff --git a/core/test/gtest/ginkgo_mpi_main.cpp b/core/test/gtest/ginkgo_mpi_main.cpp index a94cdc710a5..07a1c2c343d 100644 --- a/core/test/gtest/ginkgo_mpi_main.cpp +++ b/core/test/gtest/ginkgo_mpi_main.cpp @@ -16,16 +16,12 @@ #include #include - #include - #include - #include - #include "core/test/gtest/environments.hpp" diff --git a/core/test/gtest/resources.cpp b/core/test/gtest/resources.cpp index be7d37efd9e..62dee2b014f 100644 --- a/core/test/gtest/resources.cpp +++ b/core/test/gtest/resources.cpp @@ -2,14 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/test/gtest/resources.hpp" + #include #include #include -#include "core/test/gtest/resources.hpp" - - #ifdef GKO_COMPILING_OMP #include #endif diff --git a/core/test/log/convergence.cpp b/core/test/log/convergence.cpp index 944549346ef..8fff0c17b8e 100644 --- a/core/test/log/convergence.cpp +++ b/core/test/log/convergence.cpp @@ -2,16 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include +#include #include - #include "core/test/utils.hpp" diff --git a/core/test/log/logger.cpp b/core/test/log/logger.cpp index 18315442559..b065db66768 
100644 --- a/core/test/log/logger.cpp +++ b/core/test/log/logger.cpp @@ -8,16 +8,12 @@ GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS // clang-format on -#include - - #include - #include - #include +#include #include #include diff --git a/core/test/log/papi.cpp b/core/test/log/papi.cpp index bab3d8f3ff7..8278120cc49 100644 --- a/core/test/log/papi.cpp +++ b/core/test/log/papi.cpp @@ -2,22 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include - #include +#include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/log/performance_hint.cpp b/core/test/log/performance_hint.cpp index 6bef7ca24c3..eaac858e378 100644 --- a/core/test/log/performance_hint.cpp +++ b/core/test/log/performance_hint.cpp @@ -2,18 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include - #include - #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/log/profiler_hook.cpp b/core/test/log/profiler_hook.cpp index 4ace584670f..40bd6394475 100644 --- a/core/test/log/profiler_hook.cpp +++ b/core/test/log/profiler_hook.cpp @@ -2,22 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/log/profiler_hook.hpp" #include #include - #include - #include +#include #include #include - -#include "core/log/profiler_hook.hpp" #include "core/test/utils.hpp" diff --git a/core/test/log/record.cpp b/core/test/log/record.cpp index f8595c0c0b9..b81bd7b899e 100644 --- a/core/test/log/record.cpp +++ b/core/test/log/record.cpp @@ -2,18 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include +#include #include #include - #include "core/test/utils/assertions.hpp" diff --git a/core/test/log/stream.cpp b/core/test/log/stream.cpp index 7ac1dca2cfe..995a9975b89 100644 --- a/core/test/log/stream.cpp +++ b/core/test/log/stream.cpp @@ -2,23 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include - #include - 
#include +#include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/matrix/batch_csr.cpp b/core/test/matrix/batch_csr.cpp index bae7d08c837..57cae53d646 100644 --- a/core/test/matrix/batch_csr.cpp +++ b/core/test/matrix/batch_csr.cpp @@ -2,17 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include +#include #include - #include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" #include "core/test/utils/batch_helpers.hpp" diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp index 892d9a36b1f..334df5c0e93 100644 --- a/core/test/matrix/batch_dense.cpp +++ b/core/test/matrix/batch_dense.cpp @@ -2,18 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include #include +#include #include - #include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" #include "core/test/utils/batch_helpers.hpp" diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp index b455364933e..11f6381a43d 100644 --- a/core/test/matrix/batch_ell.cpp +++ b/core/test/matrix/batch_ell.cpp @@ -2,17 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include +#include #include - #include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" #include "core/test/utils/batch_helpers.hpp" diff --git a/core/test/matrix/batch_identity.cpp b/core/test/matrix/batch_identity.cpp index fdde07b6919..dd7a3675110 100644 --- a/core/test/matrix/batch_identity.cpp +++ b/core/test/matrix/batch_identity.cpp @@ -2,18 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include #include +#include #include - #include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" #include "core/test/utils/batch_helpers.hpp" diff --git a/core/test/matrix/coo.cpp b/core/test/matrix/coo.cpp index b4365138860..ffb8d5aee9f 100644 --- a/core/test/matrix/coo.cpp +++ 
b/core/test/matrix/coo.cpp @@ -2,11 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include #include "core/test/utils.hpp" diff --git a/core/test/matrix/coo_builder.cpp b/core/test/matrix/coo_builder.cpp index 1c0bf020d79..9bfae5cf3af 100644 --- a/core/test/matrix/coo_builder.cpp +++ b/core/test/matrix/coo_builder.cpp @@ -4,13 +4,10 @@ #include "core/matrix/coo_builder.hpp" - #include - #include - #include "core/test/utils.hpp" diff --git a/core/test/matrix/csr.cpp b/core/test/matrix/csr.cpp index 6dc4477c101..4bbdc63851a 100644 --- a/core/test/matrix/csr.cpp +++ b/core/test/matrix/csr.cpp @@ -2,14 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/matrix/csr_builder.cpp b/core/test/matrix/csr_builder.cpp index e28e17999b8..a06437bed12 100644 --- a/core/test/matrix/csr_builder.cpp +++ b/core/test/matrix/csr_builder.cpp @@ -4,13 +4,10 @@ #include "core/matrix/csr_builder.hpp" - #include - #include - #include "core/test/utils.hpp" diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp index 9a2564b2ae8..e7158a15aed 100644 --- a/core/test/matrix/dense.cpp +++ b/core/test/matrix/dense.cpp @@ -2,15 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/matrix/diagonal.cpp b/core/test/matrix/diagonal.cpp index 3eb7ab66091..de03a9350bb 100644 --- a/core/test/matrix/diagonal.cpp +++ b/core/test/matrix/diagonal.cpp @@ -2,11 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include #include "core/test/utils.hpp" diff --git a/core/test/matrix/ell.cpp b/core/test/matrix/ell.cpp index add0a5ad677..bcc2b591a50 100644 --- a/core/test/matrix/ell.cpp +++ b/core/test/matrix/ell.cpp @@ -2,11 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include #include "core/test/utils.hpp" diff 
--git a/core/test/matrix/fbcsr.cpp b/core/test/matrix/fbcsr.cpp index 8e2c4a5808b..3d3d4ee738d 100644 --- a/core/test/matrix/fbcsr.cpp +++ b/core/test/matrix/fbcsr.cpp @@ -2,18 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include - #include - #include - +#include #include "accessor/block_col_major.hpp" #include "accessor/range.hpp" diff --git a/core/test/matrix/fbcsr_builder.cpp b/core/test/matrix/fbcsr_builder.cpp index 3f1e915319e..d91a0c7b70a 100644 --- a/core/test/matrix/fbcsr_builder.cpp +++ b/core/test/matrix/fbcsr_builder.cpp @@ -4,13 +4,10 @@ #include "core/matrix/fbcsr_builder.hpp" - #include - #include - #include "core/test/utils.hpp" diff --git a/core/test/matrix/fbcsr_sample.hpp b/core/test/matrix/fbcsr_sample.hpp index 618cc2a2456..d84cbd31f19 100644 --- a/core/test/matrix/fbcsr_sample.hpp +++ b/core/test/matrix/fbcsr_sample.hpp @@ -13,7 +13,6 @@ #include #include - #include "accessor/block_col_major.hpp" #include "accessor/range.hpp" #include "core/test/utils.hpp" diff --git a/core/test/matrix/hybrid.cpp b/core/test/matrix/hybrid.cpp index 10b5603c75c..d1a69312755 100644 --- a/core/test/matrix/hybrid.cpp +++ b/core/test/matrix/hybrid.cpp @@ -2,11 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include #include "core/test/utils.hpp" diff --git a/core/test/matrix/identity.cpp b/core/test/matrix/identity.cpp index 69370df07c5..bcf9c036992 100644 --- a/core/test/matrix/identity.cpp +++ b/core/test/matrix/identity.cpp @@ -7,15 +7,11 @@ // clang-format on -#include - - #include - #include #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp index 1412e2924af..edb1532696b 100644 --- a/core/test/matrix/permutation.cpp +++ b/core/test/matrix/permutation.cpp @@ -2,17 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include #include #include - +#include #include "core/test/utils.hpp" 
diff --git a/core/test/matrix/row_gatherer.cpp b/core/test/matrix/row_gatherer.cpp index e8c15e454d2..801f639c206 100644 --- a/core/test/matrix/row_gatherer.cpp +++ b/core/test/matrix/row_gatherer.cpp @@ -2,17 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include #include #include - +#include #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" diff --git a/core/test/matrix/sellp.cpp b/core/test/matrix/sellp.cpp index 0160a329ddf..123d7bae773 100644 --- a/core/test/matrix/sellp.cpp +++ b/core/test/matrix/sellp.cpp @@ -2,11 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include #include "core/test/utils.hpp" diff --git a/core/test/matrix/sparsity_csr.cpp b/core/test/matrix/sparsity_csr.cpp index dca3ef8b5d6..e929f960f1e 100644 --- a/core/test/matrix/sparsity_csr.cpp +++ b/core/test/matrix/sparsity_csr.cpp @@ -2,18 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/mpi/base/bindings.cpp b/core/test/mpi/base/bindings.cpp index fc715d37782..d3ecf359908 100644 --- a/core/test/mpi/base/bindings.cpp +++ b/core/test/mpi/base/bindings.cpp @@ -4,17 +4,14 @@ #include - #include - #include #include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/mpi/base/communicator.cpp b/core/test/mpi/base/communicator.cpp index eaf05e148b7..a010b92b935 100644 --- a/core/test/mpi/base/communicator.cpp +++ b/core/test/mpi/base/communicator.cpp @@ -4,10 +4,8 @@ #include - #include - #include #include diff --git a/core/test/mpi/base/exception_helpers.cpp b/core/test/mpi/base/exception_helpers.cpp index 04a13a3422b..a8b74b04ba5 100644 --- a/core/test/mpi/base/exception_helpers.cpp +++ b/core/test/mpi/base/exception_helpers.cpp @@ -4,10 +4,8 @@ #include - #include - #include #include diff --git a/core/test/mpi/base/polymorphic_object.cpp 
b/core/test/mpi/base/polymorphic_object.cpp index cd55b7c533f..0c00f2af468 100644 --- a/core/test/mpi/base/polymorphic_object.cpp +++ b/core/test/mpi/base/polymorphic_object.cpp @@ -4,7 +4,6 @@ #include - #include diff --git a/core/test/mpi/base/rank_mapping.cpp b/core/test/mpi/base/rank_mapping.cpp index 6588ce5abcd..97999163035 100644 --- a/core/test/mpi/base/rank_mapping.cpp +++ b/core/test/mpi/base/rank_mapping.cpp @@ -4,13 +4,10 @@ #include - #include - #include - #include "core/test/utils.hpp" diff --git a/core/test/mpi/distributed/helpers.cpp b/core/test/mpi/distributed/helpers.cpp index 69c11f7e0ae..a2349ce395d 100644 --- a/core/test/mpi/distributed/helpers.cpp +++ b/core/test/mpi/distributed/helpers.cpp @@ -2,13 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "core/distributed/helpers.hpp" +#include #include - -#include "core/distributed/helpers.hpp" #include "core/test/utils.hpp" diff --git a/core/test/mpi/distributed/matrix.cpp b/core/test/mpi/distributed/matrix.cpp index 48a7d3e2f17..4062393564c 100644 --- a/core/test/mpi/distributed/matrix.cpp +++ b/core/test/mpi/distributed/matrix.cpp @@ -4,7 +4,6 @@ #include - #include #include #include @@ -17,7 +16,6 @@ #include #include - #include "core/test/utils.hpp" diff --git a/core/test/mpi/distributed/preconditioner/schwarz.cpp b/core/test/mpi/distributed/preconditioner/schwarz.cpp index 457303e8285..c6c0dc00650 100644 --- a/core/test/mpi/distributed/preconditioner/schwarz.cpp +++ b/core/test/mpi/distributed/preconditioner/schwarz.cpp @@ -4,13 +4,11 @@ #include - #include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/mpi/distributed/solver/multigrid.cpp b/core/test/mpi/distributed/solver/multigrid.cpp index c654edff30c..c8ab6f6d284 100644 --- a/core/test/mpi/distributed/solver/multigrid.cpp +++ b/core/test/mpi/distributed/solver/multigrid.cpp @@ -4,7 +4,6 @@ #include - #include #include #include @@ -13,7 +12,6 @@ #include #include - #include 
"core/test/utils.hpp" diff --git a/core/test/multigrid/fixed_coarsening.cpp b/core/test/multigrid/fixed_coarsening.cpp index 7e27a6a4eed..5cab7282b5d 100644 --- a/core/test/multigrid/fixed_coarsening.cpp +++ b/core/test/multigrid/fixed_coarsening.cpp @@ -2,17 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/multigrid/pgm.cpp b/core/test/multigrid/pgm.cpp index ca1c409753a..7798e97f5d6 100644 --- a/core/test/multigrid/pgm.cpp +++ b/core/test/multigrid/pgm.cpp @@ -2,17 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/preconditioner/batch_jacobi.cpp b/core/test/preconditioner/batch_jacobi.cpp index 08ccedb2f3f..f9c5f5fe124 100644 --- a/core/test/preconditioner/batch_jacobi.cpp +++ b/core/test/preconditioner/batch_jacobi.cpp @@ -2,16 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include +#include #include diff --git a/core/test/preconditioner/ic.cpp b/core/test/preconditioner/ic.cpp index d290dbfd463..fc02e800052 100644 --- a/core/test/preconditioner/ic.cpp +++ b/core/test/preconditioner/ic.cpp @@ -10,15 +10,12 @@ GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS #include - #include - #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/preconditioner/ilu.cpp b/core/test/preconditioner/ilu.cpp index f3e38702399..08806a4e92c 100644 --- a/core/test/preconditioner/ilu.cpp +++ b/core/test/preconditioner/ilu.cpp @@ -10,15 +10,12 @@ GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS #include - #include - #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/preconditioner/isai.cpp b/core/test/preconditioner/isai.cpp index ff597b83bf1..b5e7400d0e8 100644 --- a/core/test/preconditioner/isai.cpp +++ b/core/test/preconditioner/isai.cpp @@ -2,20 +2,15 @@ // // 
SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include +#include #include - #include "core/test/utils.hpp" diff --git a/core/test/preconditioner/jacobi.cpp b/core/test/preconditioner/jacobi.cpp index 44b53f520c4..8813b4c3c4d 100644 --- a/core/test/preconditioner/jacobi.cpp +++ b/core/test/preconditioner/jacobi.cpp @@ -2,14 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/reorder/amd.cpp b/core/test/reorder/amd.cpp index e1ae5360aee..9eecf3777e1 100644 --- a/core/test/reorder/amd.cpp +++ b/core/test/reorder/amd.cpp @@ -2,19 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include - #include - #include - +#include #include "core/factorization/symbolic.hpp" #include "core/test/utils.hpp" diff --git a/core/test/reorder/nested_dissection.cpp b/core/test/reorder/nested_dissection.cpp index 88b39cd4e87..fc6d7e3a06a 100644 --- a/core/test/reorder/nested_dissection.cpp +++ b/core/test/reorder/nested_dissection.cpp @@ -2,17 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/reorder/rcm.cpp b/core/test/reorder/rcm.cpp index 544628c191a..e1ca032b64f 100644 --- a/core/test/reorder/rcm.cpp +++ b/core/test/reorder/rcm.cpp @@ -2,17 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/reorder/scaled_reordered.cpp b/core/test/reorder/scaled_reordered.cpp index 7bc8452e907..8a4c12ca232 100644 --- a/core/test/reorder/scaled_reordered.cpp +++ b/core/test/reorder/scaled_reordered.cpp @@ -2,19 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include #include #include +#include #include diff --git a/core/test/solver/batch_bicgstab.cpp 
b/core/test/solver/batch_bicgstab.cpp index 9ff775e7d37..cd9446d07b2 100644 --- a/core/test/solver/batch_bicgstab.cpp +++ b/core/test/solver/batch_bicgstab.cpp @@ -2,16 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include #include - +#include #include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" diff --git a/core/test/solver/batch_cg.cpp b/core/test/solver/batch_cg.cpp index f890f26c7ae..1e97c765f8a 100644 --- a/core/test/solver/batch_cg.cpp +++ b/core/test/solver/batch_cg.cpp @@ -2,16 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include #include - +#include #include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" diff --git a/core/test/solver/bicg.cpp b/core/test/solver/bicg.cpp index d983808139f..e5a40e0c4f8 100644 --- a/core/test/solver/bicg.cpp +++ b/core/test/solver/bicg.cpp @@ -2,22 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include +#include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/solver/bicgstab.cpp b/core/test/solver/bicgstab.cpp index 6dc92670f52..f8b8d3c7b05 100644 --- a/core/test/solver/bicgstab.cpp +++ b/core/test/solver/bicgstab.cpp @@ -2,20 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include +#include #include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/solver/cb_gmres.cpp b/core/test/solver/cb_gmres.cpp index 19996fe0275..21600ed2b70 100644 --- a/core/test/solver/cb_gmres.cpp +++ b/core/test/solver/cb_gmres.cpp @@ -2,22 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include +#include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/solver/cg.cpp b/core/test/solver/cg.cpp index 3261188285b..cbf637de302 100644 --- a/core/test/solver/cg.cpp +++ b/core/test/solver/cg.cpp @@ 
-2,22 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include +#include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/solver/cgs.cpp b/core/test/solver/cgs.cpp index e76c40ab9f8..5dc80892a1b 100644 --- a/core/test/solver/cgs.cpp +++ b/core/test/solver/cgs.cpp @@ -2,22 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include +#include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/solver/direct.cpp b/core/test/solver/direct.cpp index a7df3e68a92..d895892a8be 100644 --- a/core/test/solver/direct.cpp +++ b/core/test/solver/direct.cpp @@ -2,18 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/solver/fcg.cpp b/core/test/solver/fcg.cpp index ec1b4ded76f..2898a5f5c46 100644 --- a/core/test/solver/fcg.cpp +++ b/core/test/solver/fcg.cpp @@ -2,19 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include +#include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/solver/gcr.cpp b/core/test/solver/gcr.cpp index 29ef0e1f578..2d7b5ea7974 100644 --- a/core/test/solver/gcr.cpp +++ b/core/test/solver/gcr.cpp @@ -2,22 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include +#include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp index a2bbd523bce..5d9c9e3c40e 100644 --- a/core/test/solver/gmres.cpp +++ b/core/test/solver/gmres.cpp @@ -2,22 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include +#include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/solver/idr.cpp b/core/test/solver/idr.cpp index 
1fe6c651a23..9eb79356046 100644 --- a/core/test/solver/idr.cpp +++ b/core/test/solver/idr.cpp @@ -2,20 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include +#include #include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/solver/ir.cpp b/core/test/solver/ir.cpp index e3d54d9c894..1137862a395 100644 --- a/core/test/solver/ir.cpp +++ b/core/test/solver/ir.cpp @@ -2,23 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include #include +#include #include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/solver/lower_trs.cpp b/core/test/solver/lower_trs.cpp index a218072bdb0..dfcb564ca12 100644 --- a/core/test/solver/lower_trs.cpp +++ b/core/test/solver/lower_trs.cpp @@ -4,14 +4,11 @@ #include - #include - #include #include - #include "core/test/utils.hpp" diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp index 6d38016099e..8cb545f6cb2 100644 --- a/core/test/solver/multigrid.cpp +++ b/core/test/solver/multigrid.cpp @@ -2,24 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include #include #include #include +#include #include #include - #include "core/test/utils.hpp" diff --git a/core/test/solver/upper_trs.cpp b/core/test/solver/upper_trs.cpp index 425d869156c..2e84cb81e10 100644 --- a/core/test/solver/upper_trs.cpp +++ b/core/test/solver/upper_trs.cpp @@ -4,14 +4,11 @@ #include - #include - #include #include - #include "core/test/utils.hpp" diff --git a/core/test/solver/workspace.cpp b/core/test/solver/workspace.cpp index 1c8996aeb65..3126cc67501 100644 --- a/core/test/solver/workspace.cpp +++ b/core/test/solver/workspace.cpp @@ -2,18 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include - #include - +#include #include "core/test/utils.hpp" diff --git a/core/test/stop/combined.cpp 
b/core/test/stop/combined.cpp index 401cd63fb34..2995414a7b0 100644 --- a/core/test/stop/combined.cpp +++ b/core/test/stop/combined.cpp @@ -2,16 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include - #include - +#include #include #include diff --git a/core/test/stop/criterion.cpp b/core/test/stop/criterion.cpp index 700f1829dfb..ce555d01969 100644 --- a/core/test/stop/criterion.cpp +++ b/core/test/stop/criterion.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include + namespace { diff --git a/core/test/stop/iteration.cpp b/core/test/stop/iteration.cpp index de36e2107b4..e538885e5d6 100644 --- a/core/test/stop/iteration.cpp +++ b/core/test/stop/iteration.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include + namespace { diff --git a/core/test/stop/stopping_status.cpp b/core/test/stop/stopping_status.cpp index 4e6046568a8..46b3ce86c42 100644 --- a/core/test/stop/stopping_status.cpp +++ b/core/test/stop/stopping_status.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include + namespace { diff --git a/core/test/stop/time.cpp b/core/test/stop/time.cpp index fb08055b2b3..60a22b79ad3 100644 --- a/core/test/stop/time.cpp +++ b/core/test/stop/time.cpp @@ -2,15 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include - #include +#include + namespace { diff --git a/core/test/utils.hpp b/core/test/utils.hpp index c6ce7c273d1..43ded30cde5 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -13,16 +13,13 @@ #include #include - #include - #include #include #include #include - #include "core/base/extended_float.hpp" #include "core/test/utils/array_generator.hpp" #include "core/test/utils/assertions.hpp" diff --git a/core/test/utils/array_generator.hpp b/core/test/utils/array_generator.hpp index a70751a12de..83b9018939e 100644 --- a/core/test/utils/array_generator.hpp +++ 
b/core/test/utils/array_generator.hpp @@ -10,7 +10,6 @@ #include #include - #include "core/test/utils/value_generator.hpp" diff --git a/core/test/utils/array_generator_test.cpp b/core/test/utils/array_generator_test.cpp index 21e617e2937..ae66e4686da 100644 --- a/core/test/utils/array_generator_test.cpp +++ b/core/test/utils/array_generator_test.cpp @@ -4,14 +4,11 @@ #include "core/test/utils/array_generator.hpp" - #include #include - #include - #include "core/test/utils.hpp" namespace { diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index e1194ba72a5..7bdc71ea94e 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -17,10 +17,8 @@ #include #include - #include - #include #include #include @@ -28,7 +26,6 @@ #include #include - #include "core/base/batch_utilities.hpp" #include "core/base/extended_float.hpp" diff --git a/core/test/utils/assertions_test.cpp b/core/test/utils/assertions_test.cpp index b9129ea52b6..73900397fbe 100644 --- a/core/test/utils/assertions_test.cpp +++ b/core/test/utils/assertions_test.cpp @@ -4,13 +4,10 @@ #include "core/test/utils/assertions.hpp" - #include - #include - #include #include diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp index 94f4163a387..eff6626de31 100644 --- a/core/test/utils/batch_helpers.hpp +++ b/core/test/utils/batch_helpers.hpp @@ -9,14 +9,12 @@ #include #include - #include #include #include #include #include - #include "core/test/utils/assertions.hpp" #include "core/test/utils/matrix_generator.hpp" #include "core/utils/matrix_utils.hpp" diff --git a/core/test/utils/fb_matrix_generator.hpp b/core/test/utils/fb_matrix_generator.hpp index 0b41151d807..034dd95fce1 100644 --- a/core/test/utils/fb_matrix_generator.hpp +++ b/core/test/utils/fb_matrix_generator.hpp @@ -11,13 +11,11 @@ #include #include - #include #include #include #include - #include "core/factorization/factorization_kernels.hpp" #include 
"core/test/utils/matrix_generator.hpp" #include "core/test/utils/unsort_matrix.hpp" diff --git a/core/test/utils/fb_matrix_generator_test.cpp b/core/test/utils/fb_matrix_generator_test.cpp index 6dd93b55c58..ccbb0aa477f 100644 --- a/core/test/utils/fb_matrix_generator_test.cpp +++ b/core/test/utils/fb_matrix_generator_test.cpp @@ -4,16 +4,13 @@ #include "core/test/utils/fb_matrix_generator.hpp" - #include #include #include #include - #include - #include "accessor/block_col_major.hpp" #include "core/base/utils.hpp" #include "core/test/utils/matrix_generator.hpp" diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp index 33eb6dd0b49..56ff38c520d 100644 --- a/core/test/utils/matrix_generator.hpp +++ b/core/test/utils/matrix_generator.hpp @@ -13,13 +13,11 @@ #include #include - #include #include #include #include - #include "core/test/utils/value_generator.hpp" diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp index 9a59c999c9d..43756bc1709 100644 --- a/core/test/utils/matrix_generator_test.cpp +++ b/core/test/utils/matrix_generator_test.cpp @@ -4,14 +4,11 @@ #include "core/test/utils/matrix_generator.hpp" - #include #include - #include - #include "core/base/utils.hpp" #include "core/test/utils.hpp" diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp index 58463eb361a..3c67571e1b2 100644 --- a/core/test/utils/matrix_utils_test.cpp +++ b/core/test/utils/matrix_utils_test.cpp @@ -4,18 +4,14 @@ #include "core/utils/matrix_utils.hpp" - #include #include #include - #include - #include - #include "core/test/utils.hpp" #include "core/test/utils/matrix_generator.hpp" diff --git a/core/test/utils/unsort_matrix.hpp b/core/test/utils/unsort_matrix.hpp index 1b1a403bee2..b721597b634 100644 --- a/core/test/utils/unsort_matrix.hpp +++ b/core/test/utils/unsort_matrix.hpp @@ -9,13 +9,11 @@ #include #include - #include #include #include #include - #include 
"core/base/iterator_factory.hpp" diff --git a/core/test/utils/unsort_matrix_test.cpp b/core/test/utils/unsort_matrix_test.cpp index d402b0381cb..5d2f88f982a 100644 --- a/core/test/utils/unsort_matrix_test.cpp +++ b/core/test/utils/unsort_matrix_test.cpp @@ -4,15 +4,12 @@ #include "core/test/utils/unsort_matrix.hpp" - #include #include #include - #include - #include #include #include @@ -20,7 +17,6 @@ #include #include - #include "core/test/utils.hpp" diff --git a/core/test/utils/value_generator.hpp b/core/test/utils/value_generator.hpp index 0c6b7140b8b..f18f2170c96 100644 --- a/core/test/utils/value_generator.hpp +++ b/core/test/utils/value_generator.hpp @@ -9,7 +9,6 @@ #include #include - #include diff --git a/core/test/utils/value_generator_test.cpp b/core/test/utils/value_generator_test.cpp index 4f905ce3516..633565a66ef 100644 --- a/core/test/utils/value_generator_test.cpp +++ b/core/test/utils/value_generator_test.cpp @@ -4,15 +4,12 @@ #include "core/test/utils/value_generator.hpp" - #include #include #include - #include - #include "core/test/utils.hpp" diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index dcaafd5a46c..704192d0bff 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -4,15 +4,12 @@ #include "core/base/batch_multi_vector_kernels.hpp" - #include #include - #include #include - #include "common/cuda_hip/base/blas_bindings.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp index 5251c594d42..9f07b6b4532 100644 --- a/cuda/base/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -9,7 +9,6 @@ #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "core/base/batch_struct.hpp" diff --git a/cuda/base/config.hpp b/cuda/base/config.hpp index 44c304bde5d..1ff249066bd 100644 --- 
a/cuda/base/config.hpp +++ b/cuda/base/config.hpp @@ -8,7 +8,6 @@ #include - #include "cuda/base/math.hpp" diff --git a/cuda/base/cublas_bindings.hpp b/cuda/base/cublas_bindings.hpp index c1cdf1f996e..bc8da5851d5 100644 --- a/cuda/base/cublas_bindings.hpp +++ b/cuda/base/cublas_bindings.hpp @@ -8,10 +8,8 @@ #include - #include - #include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp index 10e09f4a356..8d31ac2e90e 100644 --- a/cuda/base/curand_bindings.hpp +++ b/cuda/base/curand_bindings.hpp @@ -8,10 +8,8 @@ #include - #include - #include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp index c18e1d7e9a6..bca0a80a37b 100644 --- a/cuda/base/cusparse_bindings.hpp +++ b/cuda/base/cusparse_bindings.hpp @@ -9,10 +9,8 @@ #include #include - #include - #include "common/cuda_hip/base/types.hpp" diff --git a/cuda/base/cusparse_block_bindings.hpp b/cuda/base/cusparse_block_bindings.hpp index c3db763f0da..484401460ec 100644 --- a/cuda/base/cusparse_block_bindings.hpp +++ b/cuda/base/cusparse_block_bindings.hpp @@ -9,10 +9,8 @@ #include #include - #include - #include "common/cuda_hip/base/types.hpp" #include "cuda/base/cusparse_bindings.hpp" diff --git a/cuda/base/cusparse_handle.hpp b/cuda/base/cusparse_handle.hpp index 118aa976bab..39f1876a275 100644 --- a/cuda/base/cusparse_handle.hpp +++ b/cuda/base/cusparse_handle.hpp @@ -9,7 +9,6 @@ #include #include - #include diff --git a/cuda/base/device.cpp b/cuda/base/device.cpp index d7a9808ab2d..eb6f25695ad 100644 --- a/cuda/base/device.cpp +++ b/cuda/base/device.cpp @@ -2,13 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "cuda/base/device.hpp" +#include #include - -#include "cuda/base/device.hpp" #include "cuda/base/scoped_device_id.hpp" diff --git a/cuda/base/device_matrix_data_kernels.cu 
b/cuda/base/device_matrix_data_kernels.cu index 554abe8bc37..678c121016c 100644 --- a/cuda/base/device_matrix_data_kernels.cu +++ b/cuda/base/device_matrix_data_kernels.cu @@ -4,7 +4,6 @@ #include "core/base/device_matrix_data_kernels.hpp" - #include #include #include @@ -13,7 +12,6 @@ #include #include - #include "common/cuda_hip/base/types.hpp" #include "cuda/base/thrust.cuh" diff --git a/cuda/base/exception.cpp b/cuda/base/exception.cpp index 13557e3da50..7bb7fae5bd5 100644 --- a/cuda/base/exception.cpp +++ b/cuda/base/exception.cpp @@ -4,17 +4,14 @@ #include "ginkgo/core/base/exception.hpp" - #include - -#include #include +#include #include #include #include - #include diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index c41bc6a72c6..1b1410ca8bb 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -4,22 +4,18 @@ #include "ginkgo/core/base/executor.hpp" - #include #include #include - #include - #include #include #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/cusparse_handle.hpp" diff --git a/cuda/base/index_set_kernels.cpp b/cuda/base/index_set_kernels.cpp index 8655836a414..2041833e4c2 100644 --- a/cuda/base/index_set_kernels.cpp +++ b/cuda/base/index_set_kernels.cpp @@ -4,10 +4,8 @@ #include "core/base/index_set_kernels.hpp" - #include - #include #include #include diff --git a/cuda/base/kernel_config.hpp b/cuda/base/kernel_config.hpp index f077290b4c5..f0821a42976 100644 --- a/cuda/base/kernel_config.hpp +++ b/cuda/base/kernel_config.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh index 0d4bc4eebd5..4b1d5ac05c3 100644 --- a/cuda/base/kernel_launch.cuh +++ b/cuda/base/kernel_launch.cuh @@ -10,7 +10,6 @@ #include - #include "accessor/cuda_hip_helper.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/cuda/base/math.hpp 
b/cuda/base/math.hpp index d86a85a083e..d9fa5165cf6 100644 --- a/cuda/base/math.hpp +++ b/cuda/base/math.hpp @@ -6,11 +6,10 @@ #define GKO_CUDA_BASE_MATH_HPP_ -#include - - #include +#include + namespace gko { diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp index 7949b07f78f..e9c1658907c 100644 --- a/cuda/base/memory.cpp +++ b/cuda/base/memory.cpp @@ -4,14 +4,11 @@ #include "ginkgo/core/base/memory.hpp" - #include #include - #include - #include "cuda/base/scoped_device_id.hpp" diff --git a/cuda/base/nvtx.cpp b/cuda/base/nvtx.cpp index 6daa8afc2ca..e456cde5be0 100644 --- a/cuda/base/nvtx.cpp +++ b/cuda/base/nvtx.cpp @@ -4,7 +4,6 @@ #include - #include diff --git a/cuda/base/pointer_mode_guard.hpp b/cuda/base/pointer_mode_guard.hpp index 03327fb4dfe..39af6100c46 100644 --- a/cuda/base/pointer_mode_guard.hpp +++ b/cuda/base/pointer_mode_guard.hpp @@ -8,12 +8,10 @@ #include - -#include #include +#include #include - #include #include diff --git a/cuda/base/scoped_device_id.cpp b/cuda/base/scoped_device_id.cpp index a10e8d8913b..5851a1fe16b 100644 --- a/cuda/base/scoped_device_id.cpp +++ b/cuda/base/scoped_device_id.cpp @@ -2,19 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "cuda/base/scoped_device_id.hpp" + #include #include - #include - #include -#include "cuda/base/scoped_device_id.hpp" - - namespace gko { namespace detail { diff --git a/cuda/base/stream.cpp b/cuda/base/stream.cpp index 703c9958ecd..c6f846c3f68 100644 --- a/cuda/base/stream.cpp +++ b/cuda/base/stream.cpp @@ -4,13 +4,10 @@ #include "ginkgo/core/base/stream.hpp" - #include - #include - #include "cuda/base/scoped_device_id.hpp" diff --git a/cuda/base/thrust.cuh b/cuda/base/thrust.cuh index 35e858a2555..5d5d58e0f33 100644 --- a/cuda/base/thrust.cuh +++ b/cuda/base/thrust.cuh @@ -9,7 +9,6 @@ #include #include - #include diff --git a/cuda/base/timer.cpp b/cuda/base/timer.cpp index 01b96c19536..f9559bffb95 100644 --- a/cuda/base/timer.cpp +++ b/cuda/base/timer.cpp @@ -4,14 +4,11 @@ 
#include "ginkgo/core/base/timer.hpp" - #include #include - #include - #include "cuda/base/scoped_device_id.hpp" diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 561612f2869..7252f7d673d 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -6,20 +6,16 @@ #define GKO_CUDA_BASE_TYPES_HPP_ -#include - - #include - -#include #include +#include #include #include #include - #include +#include namespace gko { diff --git a/cuda/components/atomic.cuh b/cuda/components/atomic.cuh index 1964f0ae196..a9d63677267 100644 --- a/cuda/components/atomic.cuh +++ b/cuda/components/atomic.cuh @@ -8,7 +8,6 @@ #include - #include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh index 70643a3b16a..c4ceca9e409 100644 --- a/cuda/components/cooperative_groups.cuh +++ b/cuda/components/cooperative_groups.cuh @@ -8,10 +8,8 @@ #include - -#include #include - +#include #include "common/cuda_hip/base/config.hpp" diff --git a/cuda/components/diagonal_block_manipulation.cuh b/cuda/components/diagonal_block_manipulation.cuh index a8f27d3a81f..7f19555ace5 100644 --- a/cuda/components/diagonal_block_manipulation.cuh +++ b/cuda/components/diagonal_block_manipulation.cuh @@ -8,7 +8,6 @@ #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" diff --git a/cuda/components/format_conversion.cuh b/cuda/components/format_conversion.cuh index f0ef007c53c..6690368cc4f 100644 --- a/cuda/components/format_conversion.cuh +++ b/cuda/components/format_conversion.cuh @@ -9,7 +9,6 @@ #include #include - #include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh index 97e5d67c23a..7dd0ba13ba4 100644 --- a/cuda/components/memory.cuh +++ b/cuda/components/memory.cuh @@ -8,10 +8,8 @@ 
#include - #include - #include "common/cuda_hip/base/types.hpp" diff --git a/cuda/components/prefix_sum.cuh b/cuda/components/prefix_sum.cuh index 2f6f145e304..6693bbfc326 100644 --- a/cuda/components/prefix_sum.cuh +++ b/cuda/components/prefix_sum.cuh @@ -8,7 +8,6 @@ #include - #include "common/cuda_hip/base/blas_bindings.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/components/reduction.cuh" diff --git a/cuda/components/prefix_sum_kernels.cu b/cuda/components/prefix_sum_kernels.cu index d330ce0a2b0..60b406ff894 100644 --- a/cuda/components/prefix_sum_kernels.cu +++ b/cuda/components/prefix_sum_kernels.cu @@ -4,18 +4,14 @@ #include "core/components/prefix_sum_kernels.hpp" - #include - #include - #include #include #include - #include "cuda/base/thrust.cuh" diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh index 250c560d44b..1e4b7cb447c 100644 --- a/cuda/components/reduction.cuh +++ b/cuda/components/reduction.cuh @@ -8,11 +8,9 @@ #include - #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/cuda/components/syncfree.cuh b/cuda/components/syncfree.cuh index 0d5c0d11f43..7d519891065 100644 --- a/cuda/components/syncfree.cuh +++ b/cuda/components/syncfree.cuh @@ -8,7 +8,6 @@ #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/memory.hpp" diff --git a/cuda/components/warp_blas.cuh b/cuda/components/warp_blas.cuh index fa5e3d3ae3b..8e0042cfdad 100644 --- a/cuda/components/warp_blas.cuh +++ b/cuda/components/warp_blas.cuh @@ -9,10 +9,8 @@ #include #include - #include - #include "cuda/base/math.hpp" #include "cuda/components/reduction.cuh" diff --git a/cuda/distributed/index_map_kernels.cu b/cuda/distributed/index_map_kernels.cu index a5d838e901f..42e8f118301 100644 --- 
a/cuda/distributed/index_map_kernels.cu +++ b/cuda/distributed/index_map_kernels.cu @@ -4,7 +4,6 @@ #include "core/distributed/index_map_kernels.hpp" - #include #include #include @@ -18,10 +17,8 @@ #include #include - #include - #include "cuda/base/thrust.cuh" #include "cuda/components/atomic.cuh" #include "cuda/components/searching.cuh" diff --git a/cuda/distributed/matrix_kernels.cu b/cuda/distributed/matrix_kernels.cu index 3ad815d7090..1cb939d40e7 100644 --- a/cuda/distributed/matrix_kernels.cu +++ b/cuda/distributed/matrix_kernels.cu @@ -4,7 +4,6 @@ #include "core/distributed/matrix_kernels.hpp" - #include #include #include @@ -17,10 +16,8 @@ #include #include - #include - #include "cuda/base/thrust.cuh" #include "cuda/components/atomic.cuh" diff --git a/cuda/distributed/partition_helpers_kernels.cu b/cuda/distributed/partition_helpers_kernels.cu index b478477ce18..738d478d99a 100644 --- a/cuda/distributed/partition_helpers_kernels.cu +++ b/cuda/distributed/partition_helpers_kernels.cu @@ -4,13 +4,11 @@ #include "core/distributed/partition_helpers_kernels.hpp" - #include #include #include #include - #include "cuda/base/thrust.cuh" diff --git a/cuda/distributed/partition_kernels.cu b/cuda/distributed/partition_kernels.cu index de6c5bc6c02..050d6d285d6 100644 --- a/cuda/distributed/partition_kernels.cu +++ b/cuda/distributed/partition_kernels.cu @@ -4,7 +4,6 @@ #include "core/distributed/partition_kernels.hpp" - #include #include #include @@ -12,7 +11,6 @@ #include #include - #include "common/unified/base/kernel_launch.hpp" #include "core/components/fill_array_kernels.hpp" #include "cuda/base/thrust.cuh" diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu index ca9c419239b..60388150da4 100644 --- a/cuda/distributed/vector_kernels.cu +++ b/cuda/distributed/vector_kernels.cu @@ -4,7 +4,6 @@ #include "core/distributed/vector_kernels.hpp" - #include #include #include @@ -12,10 +11,8 @@ #include #include - #include - #include 
"cuda/base/thrust.cuh" diff --git a/cuda/factorization/cholesky_kernels.cu b/cuda/factorization/cholesky_kernels.cu index e05b0803dc2..7d5fe2c3d08 100644 --- a/cuda/factorization/cholesky_kernels.cu +++ b/cuda/factorization/cholesky_kernels.cu @@ -4,11 +4,9 @@ #include "core/factorization/cholesky_kernels.hpp" - #include #include - #include #include #include @@ -16,10 +14,8 @@ #include #include - #include - #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/fill_array_kernels.hpp" diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu index 309ded37d34..fcabf3676e6 100644 --- a/cuda/factorization/factorization_kernels.cu +++ b/cuda/factorization/factorization_kernels.cu @@ -4,10 +4,8 @@ #include "core/factorization/factorization_kernels.hpp" - #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/cuda/factorization/ic_kernels.cu b/cuda/factorization/ic_kernels.cu index 9d55856f139..3a4b4a55411 100644 --- a/cuda/factorization/ic_kernels.cu +++ b/cuda/factorization/ic_kernels.cu @@ -4,10 +4,8 @@ #include "core/factorization/ic_kernels.hpp" - #include - #include "common/cuda_hip/base/sparselib_bindings.hpp" diff --git a/cuda/factorization/ilu_kernels.cu b/cuda/factorization/ilu_kernels.cu index acebec6e94c..6096e89ef4b 100644 --- a/cuda/factorization/ilu_kernels.cu +++ b/cuda/factorization/ilu_kernels.cu @@ -4,10 +4,8 @@ #include "core/factorization/ilu_kernels.hpp" - #include - #include "common/cuda_hip/base/sparselib_bindings.hpp" diff --git a/cuda/factorization/lu_kernels.cu b/cuda/factorization/lu_kernels.cu index 9c3069f62cf..57ed7ac8531 100644 --- a/cuda/factorization/lu_kernels.cu +++ b/cuda/factorization/lu_kernels.cu @@ -4,19 +4,15 @@ #include "core/factorization/lu_kernels.hpp" - #include #include - #include #include 
#include - #include - #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/allocator.hpp" diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu index f493cb11fd1..473272fe1fb 100644 --- a/cuda/factorization/par_ic_kernels.cu +++ b/cuda/factorization/par_ic_kernels.cu @@ -4,12 +4,10 @@ #include "core/factorization/par_ic_kernels.hpp" - #include #include #include - #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/memory.hpp" #include "cuda/base/math.hpp" diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu index d958f81d2f4..fb7a0b0370a 100644 --- a/cuda/factorization/par_ict_kernels.cu +++ b/cuda/factorization/par_ict_kernels.cu @@ -4,14 +4,12 @@ #include "core/factorization/par_ict_kernels.hpp" - #include #include #include #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/memory.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu index 755723e7d4c..1f023892afb 100644 --- a/cuda/factorization/par_ilu_kernels.cu +++ b/cuda/factorization/par_ilu_kernels.cu @@ -4,10 +4,8 @@ #include "core/factorization/par_ilu_kernels.hpp" - #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/memory.hpp" diff --git a/cuda/factorization/par_ilut_approx_filter_kernels.cu b/cuda/factorization/par_ilut_approx_filter_kernels.cu index ae544939e17..51127ffd43b 100644 --- a/cuda/factorization/par_ilut_approx_filter_kernels.cu +++ b/cuda/factorization/par_ilut_approx_filter_kernels.cu @@ -2,23 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include - #include #include #include #include #include - #include "common/cuda_hip/base/config.hpp" 
#include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/cuda/factorization/par_ilut_filter_kernels.cu b/cuda/factorization/par_ilut_filter_kernels.cu index 4a24c5f305b..e15c7ec4cf6 100644 --- a/cuda/factorization/par_ilut_filter_kernels.cu +++ b/cuda/factorization/par_ilut_filter_kernels.cu @@ -2,21 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include #include #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/cuda/factorization/par_ilut_select_common.cu b/cuda/factorization/par_ilut_select_common.cu index bbba93595c8..3f910f4884e 100644 --- a/cuda/factorization/par_ilut_select_common.cu +++ b/cuda/factorization/par_ilut_select_common.cu @@ -4,7 +4,6 @@ #include "cuda/factorization/par_ilut_select_common.cuh" - #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" #include "cuda/base/math.hpp" diff --git a/cuda/factorization/par_ilut_select_kernels.cu b/cuda/factorization/par_ilut_select_kernels.cu index 6a7bd53c1c4..ac37e3a7595 100644 --- a/cuda/factorization/par_ilut_select_kernels.cu +++ b/cuda/factorization/par_ilut_select_kernels.cu @@ -2,19 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include - #include #include 
#include - #include "common/cuda_hip/base/runtime.hpp" #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "cuda/base/math.hpp" #include "cuda/components/atomic.cuh" #include "cuda/components/intrinsics.cuh" diff --git a/cuda/factorization/par_ilut_spgeam_kernels.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu index 0a751c2f48f..83ec9c974b8 100644 --- a/cuda/factorization/par_ilut_spgeam_kernels.cu +++ b/cuda/factorization/par_ilut_spgeam_kernels.cu @@ -2,19 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include #include #include #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/cuda/factorization/par_ilut_sweep_kernels.cu b/cuda/factorization/par_ilut_sweep_kernels.cu index 5924ebe328d..8bdf6c9380a 100644 --- a/cuda/factorization/par_ilut_sweep_kernels.cu +++ b/cuda/factorization/par_ilut_sweep_kernels.cu @@ -2,19 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include #include #include #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/memory.hpp" #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu index 6ec20480405..4fc5137646c 100644 --- a/cuda/matrix/batch_csr_kernels.cu +++ b/cuda/matrix/batch_csr_kernels.cu @@ -4,15 +4,12 @@ #include "core/matrix/batch_csr_kernels.hpp" - #include - 
#include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu index 673b08e5db1..e28d4f91670 100644 --- a/cuda/matrix/batch_dense_kernels.cu +++ b/cuda/matrix/batch_dense_kernels.cu @@ -4,15 +4,12 @@ #include "core/matrix/batch_dense_kernels.hpp" - #include - #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu index 8f0160bd154..90caf963200 100644 --- a/cuda/matrix/batch_ell_kernels.cu +++ b/cuda/matrix/batch_ell_kernels.cu @@ -4,15 +4,12 @@ #include "core/matrix/batch_ell_kernels.hpp" - #include - #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp index 5e9c803c9f6..5845fb2235e 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/matrix/batch_struct.hpp @@ -6,15 +6,12 @@ #define GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ -#include "core/matrix/batch_struct.hpp" - - #include #include - #include "common/cuda_hip/base/types.hpp" #include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" namespace gko { diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu index f138d0b934e..1536e88345e 100644 --- a/cuda/matrix/coo_kernels.cu +++ b/cuda/matrix/coo_kernels.cu @@ -4,14 +4,12 @@ #include "core/matrix/coo_kernels.hpp" - #include #include #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" diff --git a/cuda/matrix/csr_kernels.template.cu 
b/cuda/matrix/csr_kernels.template.cu index a0a7e4e97b8..600f4ffb5a3 100644 --- a/cuda/matrix/csr_kernels.template.cu +++ b/cuda/matrix/csr_kernels.template.cu @@ -4,10 +4,8 @@ #include "core/matrix/csr_kernels.hpp" - #include - #include #include #include @@ -16,7 +14,6 @@ #include #include - #include #include #include @@ -26,7 +23,6 @@ #include #include - #include "accessor/cuda_hip_helper.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index b117c39107b..b2114f936e7 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -4,7 +4,6 @@ #include "core/matrix/dense_kernels.hpp" - #include #include #include @@ -16,7 +15,6 @@ #include #include - #include "common/cuda_hip/base/blas_bindings.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" diff --git a/cuda/matrix/diagonal_kernels.cu b/cuda/matrix/diagonal_kernels.cu index e362ff0462b..78c0babe3a0 100644 --- a/cuda/matrix/diagonal_kernels.cu +++ b/cuda/matrix/diagonal_kernels.cu @@ -4,11 +4,9 @@ #include "core/matrix/diagonal_kernels.hpp" - #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 105122ec4a9..5c81fa7c994 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -4,17 +4,14 @@ #include "core/matrix/ell_kernels.hpp" - #include - #include #include #include #include #include - #include "accessor/cuda_hip_helper.hpp" #include "accessor/reduced_row_major.hpp" #include "common/cuda_hip/base/config.hpp" diff --git a/cuda/matrix/fbcsr_kernels.template.cu b/cuda/matrix/fbcsr_kernels.template.cu index ad36c84216e..120a81c247c 100644 --- a/cuda/matrix/fbcsr_kernels.template.cu +++ b/cuda/matrix/fbcsr_kernels.template.cu @@ -4,10 
+4,8 @@ #include "core/matrix/fbcsr_kernels.hpp" - #include - #include #include #include @@ -16,14 +14,12 @@ #include #include - #include #include #include #include #include - #include "common/cuda_hip/base/blas_bindings.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" diff --git a/cuda/matrix/fft_kernels.cu b/cuda/matrix/fft_kernels.cu index d02f1c63c70..80e938fbbff 100644 --- a/cuda/matrix/fft_kernels.cu +++ b/cuda/matrix/fft_kernels.cu @@ -4,13 +4,10 @@ #include "core/matrix/fft_kernels.hpp" - #include - #include - #include #include #include diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu index d6c20075ef4..07f5d5d8ec0 100644 --- a/cuda/matrix/sellp_kernels.cu +++ b/cuda/matrix/sellp_kernels.cu @@ -4,14 +4,12 @@ #include "core/matrix/sellp_kernels.hpp" - #include #include #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu index 311e4d3782c..17a1e004935 100644 --- a/cuda/matrix/sparsity_csr_kernels.cu +++ b/cuda/matrix/sparsity_csr_kernels.cu @@ -4,13 +4,10 @@ #include "core/matrix/sparsity_csr_kernels.hpp" - #include - #include - #include "accessor/cuda_hip_helper.hpp" #include "accessor/reduced_row_major.hpp" #include "common/cuda_hip/base/config.hpp" diff --git a/cuda/multigrid/pgm_kernels.cu b/cuda/multigrid/pgm_kernels.cu index 75c3dd911ad..399d8a06c1b 100644 --- a/cuda/multigrid/pgm_kernels.cu +++ b/cuda/multigrid/pgm_kernels.cu @@ -4,21 +4,17 @@ #include "core/multigrid/pgm_kernels.hpp" - #include - #include #include #include #include #include - #include #include - #include "common/cuda_hip/base/types.hpp" #include "cuda/base/thrust.cuh" diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu index 67c41634637..1bc39df9781 100644 
--- a/cuda/preconditioner/batch_jacobi_kernels.cu +++ b/cuda/preconditioner/batch_jacobi_kernels.cu @@ -4,12 +4,10 @@ #include "core/preconditioner/batch_jacobi_kernels.hpp" - #include #include #include - #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu index d0dd516466a..8867bf643b0 100644 --- a/cuda/preconditioner/isai_kernels.cu +++ b/cuda/preconditioner/isai_kernels.cu @@ -4,12 +4,10 @@ #include "core/preconditioner/isai_kernels.hpp" - #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu index 6150ea5b12d..74c7dea9b6b 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu @@ -2,13 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include - #include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/preconditioner/jacobi_common.hpp" diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu index 10ede90da7e..e0b9145a0f7 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu @@ -2,17 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include 
"core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/math.hpp" diff --git a/cuda/preconditioner/jacobi_generate_kernels.cu b/cuda/preconditioner/jacobi_generate_kernels.cu index f1e8320611b..651dcec611a 100644 --- a/cuda/preconditioner/jacobi_generate_kernels.cu +++ b/cuda/preconditioner/jacobi_generate_kernels.cu @@ -2,14 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include #include - #include "core/components/fill_array_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/preconditioner/jacobi_common.hpp" diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu index 129c50625f4..c12df449e42 100644 --- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu @@ -2,18 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/math.hpp" diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index bce2ff23303..783de652733 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -4,10 +4,8 @@ #include "core/preconditioner/jacobi_kernels.hpp" 
- #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.cu index d510aab6963..5cac209b8b2 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernels.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernels.cu @@ -2,12 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include - +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/preconditioner/jacobi_common.hpp" diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu index 15f6dc138ad..45af2ec668f 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu @@ -2,17 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/math.hpp" diff --git a/cuda/reorder/rcm_kernels.cu b/cuda/reorder/rcm_kernels.cu index 72322016fba..8308cf88e60 100644 --- a/cuda/reorder/rcm_kernels.cu +++ b/cuda/reorder/rcm_kernels.cu @@ -4,7 +4,6 @@ #include "core/reorder/rcm_kernels.hpp" - #include #include #include @@ -16,7 +15,6 @@ #include #include - #include #include #include @@ -24,7 +22,6 @@ #include #include - #include 
"common/cuda_hip/components/memory.hpp" #include "core/base/array_access.hpp" #include "cuda/base/thrust.cuh" diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 58e1a6b7b0d..b6ae74a5064 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -4,15 +4,12 @@ #include "core/solver/batch_bicgstab_kernels.hpp" - #include #include - #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 398e831eb09..5425bd9cd9c 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -4,15 +4,12 @@ #include "core/solver/batch_cg_kernels.hpp" - #include #include - #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu index 3dbefadf22a..8b1a28d5581 100644 --- a/cuda/solver/cb_gmres_kernels.cu +++ b/cuda/solver/cb_gmres_kernels.cu @@ -4,16 +4,13 @@ #include "core/solver/cb_gmres_kernels.hpp" - #include - #include #include #include #include - #include "accessor/cuda_hip_helper.hpp" #include "accessor/range.hpp" #include "accessor/reduced_row_major.hpp" diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 992974e95ef..a205f155487 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -10,16 +10,13 @@ #include #include - #include #include - #include #include #include - #include "common/cuda_hip/base/pointer_mode_guard.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu index f7e89c9d9d8..34aac3751d6 100644 --- 
a/cuda/solver/idr_kernels.cu +++ b/cuda/solver/idr_kernels.cu @@ -4,15 +4,12 @@ #include "core/solver/idr_kernels.hpp" - #include #include - #include #include - #include "common/cuda_hip/base/blas_bindings.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/randlib_bindings.hpp" diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu index 002cc0140cb..898ffb92552 100644 --- a/cuda/solver/lower_trs_kernels.cu +++ b/cuda/solver/lower_trs_kernels.cu @@ -4,19 +4,15 @@ #include "core/solver/lower_trs_kernels.hpp" - #include - #include #include - #include #include #include - #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu index 1d31130623a..6001d42614d 100644 --- a/cuda/solver/multigrid_kernels.cu +++ b/cuda/solver/multigrid_kernels.cu @@ -4,13 +4,11 @@ #include "core/solver/multigrid_kernels.hpp" - #include #include #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "core/base/array_access.hpp" diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu index e1e01538f79..b1f9e43ed2c 100644 --- a/cuda/solver/upper_trs_kernels.cu +++ b/cuda/solver/upper_trs_kernels.cu @@ -4,19 +4,15 @@ #include "core/solver/upper_trs_kernels.hpp" - #include - #include #include - #include #include #include - #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" diff --git a/cuda/stop/criterion_kernels.cu b/cuda/stop/criterion_kernels.cu index e54b5d140f2..20538e87304 100644 --- a/cuda/stop/criterion_kernels.cu +++ b/cuda/stop/criterion_kernels.cu @@ -4,12 +4,10 @@ #include "core/stop/criterion_kernels.hpp" - #include #include #include - #include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" #include 
"cuda/components/thread_ids.cuh" diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu index 7146d0cbf04..d59f937b918 100644 --- a/cuda/stop/residual_norm_kernels.cu +++ b/cuda/stop/residual_norm_kernels.cu @@ -4,12 +4,10 @@ #include "core/stop/residual_norm_kernels.hpp" - #include #include #include - #include "common/cuda_hip/base/types.hpp" #include "core/base/array_access.hpp" #include "cuda/base/math.hpp" diff --git a/cuda/test/base/array.cpp b/cuda/test/base/array.cpp index 6e63b13ff7c..edb6b71676a 100644 --- a/cuda/test/base/array.cpp +++ b/cuda/test/base/array.cpp @@ -2,15 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - +#include #include - #include "core/base/array_access.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu index 012b5017dc3..8eb3dbd19fe 100644 --- a/cuda/test/base/cuda_executor.cu +++ b/cuda/test/base/cuda_executor.cu @@ -2,18 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include - #include - #include #include +#include #include #include "common/cuda_hip/base/executor.hpp.inc" diff --git a/cuda/test/base/cuda_executor_topology.cu b/cuda/test/base/cuda_executor_topology.cu index 2a6d5e9b528..790fc0be1f1 100644 --- a/cuda/test/base/cuda_executor_topology.cu +++ b/cuda/test/base/cuda_executor_topology.cu @@ -2,13 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include +#include + #if defined(__unix__) || defined(__APPLE__) #include @@ -18,11 +17,9 @@ #include - #include #include - #include "cuda/test/utils.hpp" diff --git a/cuda/test/base/exception_helpers.cu b/cuda/test/base/exception_helpers.cu index 7e85601328a..7ee7ca0e8f0 100644 --- a/cuda/test/base/exception_helpers.cu +++ b/cuda/test/base/exception_helpers.cu @@ -2,18 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - -#include #include +#include #include #include #include - 
#include +#include + namespace { diff --git a/cuda/test/base/index_set.cpp b/cuda/test/base/index_set.cpp index 797bc3f1f44..0e75f3dd140 100644 --- a/cuda/test/base/index_set.cpp +++ b/cuda/test/base/index_set.cpp @@ -2,20 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include +#include #include - #include "cuda/test/utils.hpp" diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index da52b3ffc87..ddf14f7baf9 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -4,20 +4,16 @@ #include "common/unified/base/kernel_launch.hpp" - #include #include - #include - #include #include #include #include - #include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/base/array_access.hpp" diff --git a/cuda/test/base/lin_op.cpp b/cuda/test/base/lin_op.cpp index dd703ec07fa..87cd8ee32bc 100644 --- a/cuda/test/base/lin_op.cpp +++ b/cuda/test/base/lin_op.cpp @@ -4,7 +4,6 @@ #include - #include "cuda/test/utils.hpp" diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu index 944e7642223..e3c1d78ed39 100644 --- a/cuda/test/base/math.cu +++ b/cuda/test/base/math.cu @@ -2,23 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "cuda/base/math.hpp" #include #include #include - #include - #include #include - +#include #include "common/cuda_hip/base/types.hpp" -#include "cuda/base/math.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/base/memory.cpp b/cuda/test/base/memory.cpp index f1657639ff0..345616c0588 100644 --- a/cuda/test/base/memory.cpp +++ b/cuda/test/base/memory.cpp @@ -2,20 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include - #include - #include #include #include - +#include #include "cuda/test/utils.hpp" diff --git a/cuda/test/base/scoped_device_id.cu b/cuda/test/base/scoped_device_id.cu index 5c2e496b64b..0ac4b21e207 100644 
--- a/cuda/test/base/scoped_device_id.cu +++ b/cuda/test/base/scoped_device_id.cu @@ -2,16 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "cuda/base/scoped_device_id.hpp" +#include #include - #include - -#include "cuda/base/scoped_device_id.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/components/cooperative_groups.cu b/cuda/test/components/cooperative_groups.cu index c9d9e6bf124..df3cef86bb8 100644 --- a/cuda/test/components/cooperative_groups.cu +++ b/cuda/test/components/cooperative_groups.cu @@ -2,18 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include #include - #include #include - #include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu index 37b032eb794..2788767b078 100644 --- a/cuda/test/components/merging.cu +++ b/cuda/test/components/merging.cu @@ -4,20 +4,16 @@ #include "cuda/components/merging.cuh" - #include #include #include #include - #include - #include #include - #include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/components/searching.cu b/cuda/test/components/searching.cu index ffe00c247c0..afe7fb4b442 100644 --- a/cuda/test/components/searching.cu +++ b/cuda/test/components/searching.cu @@ -4,19 +4,15 @@ #include "cuda/components/searching.cuh" - #include #include #include - #include - #include #include - #include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/components/sorting.cu b/cuda/test/components/sorting.cu index 19c7daab782..e1524ce0078 100644 --- a/cuda/test/components/sorting.cu +++ b/cuda/test/components/sorting.cu @@ -4,18 +4,14 @@ #include "cuda/components/sorting.cuh" - #include #include - #include - #include #include - #include 
"cuda/test/utils.hpp" diff --git a/cuda/test/solver/lower_trs_kernels.cu b/cuda/test/solver/lower_trs_kernels.cu index 00d4f371ac5..67eeaf51847 100644 --- a/cuda/test/solver/lower_trs_kernels.cu +++ b/cuda/test/solver/lower_trs_kernels.cu @@ -2,24 +2,21 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/solver/lower_trs_kernels.hpp" + #include #include - #include - #include - #include #include #include #include #include - -#include "core/solver/lower_trs_kernels.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/solver/upper_trs_kernels.cu b/cuda/test/solver/upper_trs_kernels.cu index de2368be6ab..3ad061e2bc5 100644 --- a/cuda/test/solver/upper_trs_kernels.cu +++ b/cuda/test/solver/upper_trs_kernels.cu @@ -2,24 +2,21 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/solver/upper_trs_kernels.hpp" + #include #include - #include - #include - #include #include #include #include #include - -#include "core/solver/upper_trs_kernels.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp index a398deedd7b..17dd1fd8722 100644 --- a/cuda/test/utils.hpp +++ b/cuda/test/utils.hpp @@ -6,14 +6,11 @@ #define GKO_CUDA_TEST_UTILS_HPP_ -#include "core/test/utils.hpp" - - #include #include - #include "core/test/gtest/resources.hpp" +#include "core/test/utils.hpp" #include "cuda/base/device.hpp" diff --git a/cuda/test/utils/assertions_test.cu b/cuda/test/utils/assertions_test.cu index 482744a893b..65b4cdc75a7 100644 --- a/cuda/test/utils/assertions_test.cu +++ b/cuda/test/utils/assertions_test.cu @@ -4,14 +4,11 @@ #include "core/test/utils/assertions.hpp" - #include - #include #include - #include "cuda/test/utils.hpp" diff --git a/devices/device.cpp b/devices/device.cpp index 5a036f491c1..cac4be8aa7b 100644 --- a/devices/device.cpp +++ b/devices/device.cpp @@ -5,7 +5,6 @@ #include #include - #include diff --git a/devices/dpcpp/executor.cpp b/devices/dpcpp/executor.cpp index 435d9426374..aaca50d3931 100644 --- 
a/devices/dpcpp/executor.cpp +++ b/devices/dpcpp/executor.cpp @@ -4,11 +4,9 @@ #include "ginkgo/core/base/executor.hpp" - #include #include - #include #include diff --git a/devices/machine_topology.cpp b/devices/machine_topology.cpp index af881af6df4..406580ef7d1 100644 --- a/devices/machine_topology.cpp +++ b/devices/machine_topology.cpp @@ -6,7 +6,6 @@ #include #include - #include diff --git a/devices/omp/executor.cpp b/devices/omp/executor.cpp index 448d7b68d63..54b9c9c36be 100644 --- a/devices/omp/executor.cpp +++ b/devices/omp/executor.cpp @@ -4,11 +4,9 @@ #include "ginkgo/core/base/executor.hpp" - #include #include - #include #include diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index bb84d945745..8f607725bc8 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -4,19 +4,15 @@ #include "core/base/batch_multi_vector_kernels.hpp" - #include - #include - #include #include #include #include - #include "core/base/batch_struct.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "dpcpp/base/batch_struct.hpp" diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp index e183e11dcf8..9c20a8574ef 100644 --- a/dpcpp/base/batch_struct.hpp +++ b/dpcpp/base/batch_struct.hpp @@ -9,7 +9,6 @@ #include #include - #include "core/base/batch_struct.hpp" #include "dpcpp/base/config.hpp" diff --git a/dpcpp/base/config.hpp b/dpcpp/base/config.hpp index 03a419bf260..12330c1b992 100644 --- a/dpcpp/base/config.hpp +++ b/dpcpp/base/config.hpp @@ -10,7 +10,6 @@ #include #include - #include "core/base/types.hpp" diff --git a/dpcpp/base/device_matrix_data_kernels.dp.cpp b/dpcpp/base/device_matrix_data_kernels.dp.cpp index a735470d5ba..f39615613fe 100644 --- a/dpcpp/base/device_matrix_data_kernels.dp.cpp +++ b/dpcpp/base/device_matrix_data_kernels.dp.cpp @@ -2,16 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause - #include - #include 
"core/base/device_matrix_data_kernels.hpp" - #include - #include "dpcpp/base/onedpl.hpp" diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index 159ee7eb533..29f0810d9d9 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -4,17 +4,14 @@ #include "ginkgo/core/base/executor.hpp" - #include #include #include #include #include - #include - #include #include diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp index 7e0f3f9ce8c..f4ae9f0560d 100644 --- a/dpcpp/base/helper.dp.cpp +++ b/dpcpp/base/helper.dp.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include "dpcpp/base/helper.hpp" +#include + namespace gko { namespace kernels { diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index 78b933a2e32..b8cf1a8451c 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -8,14 +8,11 @@ #include - #include - #include #include - #include "core/base/types.hpp" #include "dpcpp/base/dim3.dp.hpp" diff --git a/dpcpp/base/index_set_kernels.dp.cpp b/dpcpp/base/index_set_kernels.dp.cpp index abd4caaa482..8f6c46d2405 100644 --- a/dpcpp/base/index_set_kernels.dp.cpp +++ b/dpcpp/base/index_set_kernels.dp.cpp @@ -4,10 +4,8 @@ #include "core/base/index_set_kernels.hpp" - #include - #include #include #include diff --git a/dpcpp/base/kernel_launch.dp.hpp b/dpcpp/base/kernel_launch.dp.hpp index 38928adf531..7aa117692f7 100644 --- a/dpcpp/base/kernel_launch.dp.hpp +++ b/dpcpp/base/kernel_launch.dp.hpp @@ -10,7 +10,6 @@ #include - #include diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index a92eb89737f..83436966ecb 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -10,7 +10,6 @@ #include - #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" diff --git a/dpcpp/base/onedpl.hpp b/dpcpp/base/onedpl.hpp index 
8ea971f4602..213d4296700 100644 --- a/dpcpp/base/onedpl.hpp +++ b/dpcpp/base/onedpl.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp index 784c53b87bb..004c296553c 100644 --- a/dpcpp/base/onemkl_bindings.hpp +++ b/dpcpp/base/onemkl_bindings.hpp @@ -8,11 +8,9 @@ #include - #include #include - #include diff --git a/dpcpp/base/scoped_device_id.dp.cpp b/dpcpp/base/scoped_device_id.dp.cpp index 97c8b6714d4..161c5a26003 100644 --- a/dpcpp/base/scoped_device_id.dp.cpp +++ b/dpcpp/base/scoped_device_id.dp.cpp @@ -5,7 +5,6 @@ #include #include - #include "core/base/noop_scoped_device_id_guard.hpp" diff --git a/dpcpp/base/timer.dp.cpp b/dpcpp/base/timer.dp.cpp index da347b14ddf..ed21e1b79a5 100644 --- a/dpcpp/base/timer.dp.cpp +++ b/dpcpp/base/timer.dp.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/base/timer.hpp" - #include - #include diff --git a/dpcpp/components/atomic.dp.hpp b/dpcpp/components/atomic.dp.hpp index 3126e9a6ab1..8168421a488 100644 --- a/dpcpp/components/atomic.dp.hpp +++ b/dpcpp/components/atomic.dp.hpp @@ -8,10 +8,8 @@ #include - #include - #include "dpcpp/base/dpct.hpp" diff --git a/dpcpp/components/cooperative_groups.dp.hpp b/dpcpp/components/cooperative_groups.dp.hpp index 879f0c25d25..c758cf42710 100644 --- a/dpcpp/components/cooperative_groups.dp.hpp +++ b/dpcpp/components/cooperative_groups.dp.hpp @@ -8,10 +8,8 @@ #include - #include - #include "dpcpp/base/config.hpp" #include "dpcpp/base/dpct.hpp" diff --git a/dpcpp/components/diagonal_block_manipulation.dp.hpp b/dpcpp/components/diagonal_block_manipulation.dp.hpp index 3e19efebec2..626a225c4fa 100644 --- a/dpcpp/components/diagonal_block_manipulation.dp.hpp +++ b/dpcpp/components/diagonal_block_manipulation.dp.hpp @@ -8,10 +8,8 @@ #include - #include - #include "dpcpp/base/config.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" diff --git a/dpcpp/components/format_conversion.dp.hpp 
b/dpcpp/components/format_conversion.dp.hpp index a9a3ac408a3..17cf55389df 100644 --- a/dpcpp/components/format_conversion.dp.hpp +++ b/dpcpp/components/format_conversion.dp.hpp @@ -8,14 +8,11 @@ #include - #include - #include #include - #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" diff --git a/dpcpp/components/intrinsics.dp.hpp b/dpcpp/components/intrinsics.dp.hpp index 7230f5124b0..369a3dff8b9 100644 --- a/dpcpp/components/intrinsics.dp.hpp +++ b/dpcpp/components/intrinsics.dp.hpp @@ -8,10 +8,8 @@ #include - #include - #include "dpcpp/base/dpct.hpp" diff --git a/dpcpp/components/merging.dp.hpp b/dpcpp/components/merging.dp.hpp index f700364769e..8d2f96e70bf 100644 --- a/dpcpp/components/merging.dp.hpp +++ b/dpcpp/components/merging.dp.hpp @@ -8,10 +8,8 @@ #include - #include - #include "core/base/utils.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/components/intrinsics.dp.hpp" diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index 18287b82fe7..b1ae9da32bb 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -8,10 +8,8 @@ #include - #include - #include "core/base/types.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" diff --git a/dpcpp/components/prefix_sum_kernels.dp.cpp b/dpcpp/components/prefix_sum_kernels.dp.cpp index c8a663e20d4..a47f45e9565 100644 --- a/dpcpp/components/prefix_sum_kernels.dp.cpp +++ b/dpcpp/components/prefix_sum_kernels.dp.cpp @@ -4,13 +4,10 @@ #include "core/components/prefix_sum_kernels.hpp" - #include - #include - #include "core/base/types.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/prefix_sum.dp.hpp" diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index 1bdaa7dbb10..aed8166d601 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -8,15 +8,12 @@ #include - 
#include - #include #include #include - #include "core/base/array_access.hpp" #include "core/base/types.hpp" #include "core/synthesizer/implementation_selection.hpp" diff --git a/dpcpp/components/searching.dp.hpp b/dpcpp/components/searching.dp.hpp index 903492599bc..b4cbd1bb726 100644 --- a/dpcpp/components/searching.dp.hpp +++ b/dpcpp/components/searching.dp.hpp @@ -8,7 +8,6 @@ #include - #include "dpcpp/base/config.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/components/intrinsics.dp.hpp" diff --git a/dpcpp/components/segment_scan.dp.hpp b/dpcpp/components/segment_scan.dp.hpp index 23b2f0a15d0..b6c26523f30 100644 --- a/dpcpp/components/segment_scan.dp.hpp +++ b/dpcpp/components/segment_scan.dp.hpp @@ -8,7 +8,6 @@ #include - #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" diff --git a/dpcpp/components/sorting.dp.hpp b/dpcpp/components/sorting.dp.hpp index 7b7ddacb221..e616903721c 100644 --- a/dpcpp/components/sorting.dp.hpp +++ b/dpcpp/components/sorting.dp.hpp @@ -8,7 +8,6 @@ #include - #include "dpcpp/base/config.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp index 5a1b98e4e05..09f7b24c6ee 100644 --- a/dpcpp/components/thread_ids.dp.hpp +++ b/dpcpp/components/thread_ids.dp.hpp @@ -8,7 +8,6 @@ #include - #include "dpcpp/base/config.hpp" #include "dpcpp/base/dpct.hpp" diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp index 019a3cb6644..1d25cbf3837 100644 --- a/dpcpp/components/uninitialized_array.hpp +++ b/dpcpp/components/uninitialized_array.hpp @@ -8,7 +8,6 @@ #include - #include "dpcpp/base/dpct.hpp" diff --git a/dpcpp/components/warp_blas.dp.hpp b/dpcpp/components/warp_blas.dp.hpp index 0f2c4644dea..dabc812930f 100644 --- a/dpcpp/components/warp_blas.dp.hpp +++ b/dpcpp/components/warp_blas.dp.hpp @@ -9,13 +9,10 
@@ #include #include - #include - #include - #include "dpcpp/base/dpct.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" diff --git a/dpcpp/distributed/index_map_kernels.dp.cpp b/dpcpp/distributed/index_map_kernels.dp.cpp index 84424976778..cf1b28140e1 100644 --- a/dpcpp/distributed/index_map_kernels.dp.cpp +++ b/dpcpp/distributed/index_map_kernels.dp.cpp @@ -4,7 +4,6 @@ #include "core/distributed/index_map_kernels.hpp" - #include "common/unified/base/kernel_launch.hpp" diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp index 5649cb579c9..47adaaeca59 100644 --- a/dpcpp/distributed/matrix_kernels.dp.cpp +++ b/dpcpp/distributed/matrix_kernels.dp.cpp @@ -4,7 +4,6 @@ #include "core/distributed/matrix_kernels.hpp" - #include diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp index c7a94baad54..28a0cfd5997 100644 --- a/dpcpp/distributed/partition_helpers_kernels.dp.cpp +++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp @@ -2,12 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause - #include #include #include - #include "core/distributed/partition_helpers_kernels.hpp" diff --git a/dpcpp/distributed/partition_kernels.dp.cpp b/dpcpp/distributed/partition_kernels.dp.cpp index 5eeb2f85178..175ea3ac050 100644 --- a/dpcpp/distributed/partition_kernels.dp.cpp +++ b/dpcpp/distributed/partition_kernels.dp.cpp @@ -2,14 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause - #include #include - #include "core/distributed/partition_kernels.hpp" - #include "common/unified/base/kernel_launch.hpp" #include "core/components/fill_array_kernels.hpp" #include "dpcpp/base/onedpl.hpp" diff --git a/dpcpp/distributed/vector_kernels.dp.cpp b/dpcpp/distributed/vector_kernels.dp.cpp index 2f7769d37c3..fdc5dd2e52d 100644 --- a/dpcpp/distributed/vector_kernels.dp.cpp +++ b/dpcpp/distributed/vector_kernels.dp.cpp @@ -4,7 +4,6 @@ 
#include "core/distributed/vector_kernels.hpp" - #include diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp index b69f50e8dfb..b381e6989e4 100644 --- a/dpcpp/factorization/cholesky_kernels.dp.cpp +++ b/dpcpp/factorization/cholesky_kernels.dp.cpp @@ -4,17 +4,13 @@ #include "core/factorization/cholesky_kernels.hpp" - #include #include - #include - #include - #include "core/factorization/elimination_forest.hpp" diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp index 374e966e46d..1d9912b4f12 100644 --- a/dpcpp/factorization/factorization_kernels.dp.cpp +++ b/dpcpp/factorization/factorization_kernels.dp.cpp @@ -4,13 +4,10 @@ #include "core/factorization/factorization_kernels.hpp" - #include - #include - #include "core/base/array_access.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" diff --git a/dpcpp/factorization/lu_kernels.dp.cpp b/dpcpp/factorization/lu_kernels.dp.cpp index c4a471b8c4b..a891b5b7b2f 100644 --- a/dpcpp/factorization/lu_kernels.dp.cpp +++ b/dpcpp/factorization/lu_kernels.dp.cpp @@ -4,14 +4,11 @@ #include "core/factorization/lu_kernels.hpp" - #include #include - #include - #include "core/base/allocator.hpp" #include "core/matrix/csr_lookup.hpp" diff --git a/dpcpp/factorization/par_ic_kernels.dp.cpp b/dpcpp/factorization/par_ic_kernels.dp.cpp index 3f43a488abc..5428460fac5 100644 --- a/dpcpp/factorization/par_ic_kernels.dp.cpp +++ b/dpcpp/factorization/par_ic_kernels.dp.cpp @@ -4,15 +4,12 @@ #include "core/factorization/par_ic_kernels.hpp" - #include - #include #include #include - #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" diff --git a/dpcpp/factorization/par_ict_kernels.dp.cpp b/dpcpp/factorization/par_ict_kernels.dp.cpp index c65fd094955..fb99b662dec 100644 --- a/dpcpp/factorization/par_ict_kernels.dp.cpp +++ b/dpcpp/factorization/par_ict_kernels.dp.cpp @@ 
-4,20 +4,16 @@ #include "core/factorization/par_ict_kernels.hpp" - #include - #include - #include #include #include #include #include - #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" diff --git a/dpcpp/factorization/par_ilu_kernels.dp.cpp b/dpcpp/factorization/par_ilu_kernels.dp.cpp index 61c059e8c7d..abfd2d72238 100644 --- a/dpcpp/factorization/par_ilu_kernels.dp.cpp +++ b/dpcpp/factorization/par_ilu_kernels.dp.cpp @@ -4,13 +4,10 @@ #include "core/factorization/par_ilu_kernels.hpp" - #include - #include - #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" diff --git a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp index 9f3a3062ad6..776ffba3fb1 100644 --- a/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp +++ b/dpcpp/factorization/par_ilut_approx_filter_kernel.dp.cpp @@ -2,24 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include #include - #include - #include #include #include #include #include - #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp index 273e21e47fd..5ce9df8a0a9 100644 --- a/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp +++ b/dpcpp/factorization/par_ilut_filter_kernel.dp.cpp @@ -2,20 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include - #include #include #include #include #include - #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include 
"core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/dpcpp/factorization/par_ilut_kernels.dp.cpp b/dpcpp/factorization/par_ilut_kernels.dp.cpp index cfde68b298b..5c9d4c6d769 100644 --- a/dpcpp/factorization/par_ilut_kernels.dp.cpp +++ b/dpcpp/factorization/par_ilut_kernels.dp.cpp @@ -4,22 +4,18 @@ #include "core/factorization/par_ilut_kernels.hpp" - #include #include #include #include - #include - #include #include #include #include - #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" diff --git a/dpcpp/factorization/par_ilut_select_common.dp.cpp b/dpcpp/factorization/par_ilut_select_common.dp.cpp index 1ee22bcef2a..acf383f84a0 100644 --- a/dpcpp/factorization/par_ilut_select_common.dp.cpp +++ b/dpcpp/factorization/par_ilut_select_common.dp.cpp @@ -2,16 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - +#include "dpcpp/factorization/par_ilut_select_common.dp.hpp" #include - #include - #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/components/atomic.dp.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" @@ -20,7 +18,6 @@ #include "dpcpp/components/searching.dp.hpp" #include "dpcpp/components/sorting.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" -#include "dpcpp/factorization/par_ilut_select_common.dp.hpp" namespace gko { diff --git a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp index 8b5e6f36d10..589f8267f21 100644 --- a/dpcpp/factorization/par_ilut_select_kernel.dp.cpp +++ b/dpcpp/factorization/par_ilut_select_kernel.dp.cpp @@ -2,22 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include #include - #include - #include #include #include - #include 
"core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/components/atomic.dp.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" diff --git a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp index 6ba0c7987cd..246228763bf 100644 --- a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp +++ b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp @@ -2,23 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include - #include - #include #include #include #include #include - #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp index 9501a35f3c9..601e5dc12d3 100644 --- a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp +++ b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp @@ -2,20 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include - #include #include #include #include #include - #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/dpcpp/log/batch_logger.hpp b/dpcpp/log/batch_logger.hpp index 309c624d6fc..c6ba9044db4 100644 --- a/dpcpp/log/batch_logger.hpp +++ b/dpcpp/log/batch_logger.hpp @@ -10,7 +10,6 @@ #include #include - #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" diff --git a/dpcpp/matrix/batch_csr_kernels.dp.cpp b/dpcpp/matrix/batch_csr_kernels.dp.cpp index 31ef1e2e1e1..9feb824a3aa 100644 --- 
a/dpcpp/matrix/batch_csr_kernels.dp.cpp +++ b/dpcpp/matrix/batch_csr_kernels.dp.cpp @@ -4,17 +4,13 @@ #include "core/matrix/batch_csr_kernels.hpp" - #include - #include - #include #include - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "dpcpp/base/batch_struct.hpp" diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp index 34efd9525fb..a9f6afce0f5 100644 --- a/dpcpp/matrix/batch_dense_kernels.dp.cpp +++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp @@ -4,19 +4,15 @@ #include "core/matrix/batch_dense_kernels.hpp" - #include - #include - #include #include #include #include - #include "core/base/batch_struct.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/batch_struct.hpp" diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp index 9db98da7108..2cb40dc35eb 100644 --- a/dpcpp/matrix/batch_ell_kernels.dp.cpp +++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp @@ -4,17 +4,13 @@ #include "core/matrix/batch_ell_kernels.hpp" - #include - #include - #include #include - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "dpcpp/base/batch_struct.hpp" diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp index e504afdbc81..77b9eb6b3d5 100644 --- a/dpcpp/matrix/batch_struct.hpp +++ b/dpcpp/matrix/batch_struct.hpp @@ -6,14 +6,11 @@ #define GKO_DPCPP_MATRIX_BATCH_STRUCT_HPP_ -#include "core/matrix/batch_struct.hpp" - - #include #include - #include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" namespace gko { diff --git a/dpcpp/matrix/coo_kernels.dp.cpp b/dpcpp/matrix/coo_kernels.dp.cpp index ba7b5013253..595af92b33b 100644 --- a/dpcpp/matrix/coo_kernels.dp.cpp +++ b/dpcpp/matrix/coo_kernels.dp.cpp @@ -4,17 +4,14 @@ #include "core/matrix/coo_kernels.hpp" - #include - #include #include #include #include #include - #include "core/matrix/dense_kernels.hpp" #include 
"dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index e276c7520c1..7e5d0229c86 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -4,14 +4,11 @@ #include "core/matrix/csr_kernels.hpp" - #include - #include #include - #include #include #include @@ -21,7 +18,6 @@ #include #include - #include "core/base/array_access.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/base/utils.hpp" diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 1192b893010..04f3229eaed 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -4,11 +4,9 @@ #include "core/matrix/dense_kernels.hpp" - #include #include - #include #include #include @@ -19,7 +17,6 @@ #include #include - #include "core/components/prefix_sum_kernels.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" diff --git a/dpcpp/matrix/diagonal_kernels.dp.cpp b/dpcpp/matrix/diagonal_kernels.dp.cpp index e1ee7ac8b17..2b63138abbe 100644 --- a/dpcpp/matrix/diagonal_kernels.dp.cpp +++ b/dpcpp/matrix/diagonal_kernels.dp.cpp @@ -4,14 +4,11 @@ #include "core/matrix/diagonal_kernels.hpp" - #include - #include #include - #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/helper.hpp" diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index 55ec656ae25..a97cb602d52 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -4,20 +4,16 @@ #include "core/matrix/ell_kernels.hpp" - #include - #include - #include #include #include #include #include - #include "accessor/reduced_row_major.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" diff --git a/dpcpp/matrix/fbcsr_kernels.dp.cpp b/dpcpp/matrix/fbcsr_kernels.dp.cpp index 6a2b43a4165..bf858be51e3 100644 --- 
a/dpcpp/matrix/fbcsr_kernels.dp.cpp +++ b/dpcpp/matrix/fbcsr_kernels.dp.cpp @@ -4,17 +4,14 @@ #include "core/matrix/fbcsr_kernels.hpp" - #include - #include #include #include #include #include - #include "dpcpp/base/config.hpp" diff --git a/dpcpp/matrix/fft_kernels.dp.cpp b/dpcpp/matrix/fft_kernels.dp.cpp index 713f0d99a0b..83c085e8d15 100644 --- a/dpcpp/matrix/fft_kernels.dp.cpp +++ b/dpcpp/matrix/fft_kernels.dp.cpp @@ -4,7 +4,6 @@ #include "core/matrix/fft_kernels.hpp" - #include #include #include diff --git a/dpcpp/matrix/sellp_kernels.dp.cpp b/dpcpp/matrix/sellp_kernels.dp.cpp index 83078369371..9c0fe717e8a 100644 --- a/dpcpp/matrix/sellp_kernels.dp.cpp +++ b/dpcpp/matrix/sellp_kernels.dp.cpp @@ -4,17 +4,14 @@ #include "core/matrix/sellp_kernels.hpp" - #include - #include #include #include #include #include - #include "core/components/prefix_sum_kernels.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp index f355216eb08..66c57ac5b35 100644 --- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp +++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp @@ -4,13 +4,10 @@ #include "core/matrix/sparsity_csr_kernels.hpp" - #include - #include - #include "accessor/reduced_row_major.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/synthesizer/implementation_selection.hpp" diff --git a/dpcpp/multigrid/pgm_kernels.dp.cpp b/dpcpp/multigrid/pgm_kernels.dp.cpp index 3241c8b1ed1..a9148c54ff4 100644 --- a/dpcpp/multigrid/pgm_kernels.dp.cpp +++ b/dpcpp/multigrid/pgm_kernels.dp.cpp @@ -2,20 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause - #include - #include "core/multigrid/pgm_kernels.hpp" - #include - #include #include - #include "dpcpp/base/onedpl.hpp" diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp index 752ae1d41de..e66e7141a47 100644 --- 
a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp @@ -4,10 +4,8 @@ #include "core/preconditioner/batch_jacobi_kernels.hpp" - #include - #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp index c4bc933a4d7..4082035ff9f 100644 --- a/dpcpp/preconditioner/isai_kernels.dp.cpp +++ b/dpcpp/preconditioner/isai_kernels.dp.cpp @@ -4,15 +4,12 @@ #include "core/preconditioner/isai_kernels.hpp" - #include - #include #include #include - #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" #include "dpcpp/base/config.hpp" diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp index 0b54a14693c..e8c086ec0a6 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp @@ -2,17 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include - #include - #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp index 2098b7057e7..0e26989808e 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_kernel.dp.cpp @@ -2,13 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include - #include "core/matrix/dense_kernels.hpp" +#include 
"core/preconditioner/jacobi_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/preconditioner/jacobi_common.hpp" diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index c23e9101d1a..d957ea2c5be 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -2,18 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include - #include #include - #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" diff --git a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp index d6c3d4ce14c..62ff7fdbb51 100644 --- a/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_kernel.dp.cpp @@ -2,14 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include #include - #include "core/components/fill_array_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/preconditioner/jacobi_common.hpp" diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 12b2251c7a5..886f96e88e3 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -4,13 +4,10 @@ #include "core/preconditioner/jacobi_kernels.hpp" - #include - #include - #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" diff --git 
a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp index aade299f05b..c088ae8e986 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp @@ -2,17 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include - #include - #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" diff --git a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp index ceed1affd14..25701c6dc55 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_kernel.dp.cpp @@ -2,12 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include - +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/preconditioner/jacobi_common.hpp" diff --git a/dpcpp/reorder/rcm_kernels.dp.cpp b/dpcpp/reorder/rcm_kernels.dp.cpp index 95a8fa38b80..350b4c90a6d 100644 --- a/dpcpp/reorder/rcm_kernels.dp.cpp +++ b/dpcpp/reorder/rcm_kernels.dp.cpp @@ -4,10 +4,8 @@ #include "core/reorder/rcm_kernels.hpp" - #include - #include #include #include diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp index aab068d103e..344e4af56b9 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp +++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp @@ -4,15 +4,12 @@ #include "core/solver/batch_bicgstab_kernels.hpp" - #include - #include #include #include - #include 
"core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp index 02c40424a35..0787afa6fd3 100644 --- a/dpcpp/solver/batch_cg_kernels.dp.cpp +++ b/dpcpp/solver/batch_cg_kernels.dp.cpp @@ -4,15 +4,12 @@ #include "core/solver/batch_cg_kernels.hpp" - #include - #include #include #include - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index dbf4bdfadcb..7ab010ba29f 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -4,19 +4,15 @@ #include "core/solver/cb_gmres_kernels.hpp" - #include - #include - #include #include #include #include - #include "accessor/range.hpp" #include "accessor/reduced_row_major.hpp" #include "accessor/scaled_reduced_row_major.hpp" diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index fdd924ad4d6..d59ada362f9 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -2,21 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/solver/idr_kernels.hpp" +#include +#include "core/solver/idr_kernels.hpp" #include #include - #include -#include - #include #include - #include "core/components/fill_array_kernels.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" diff --git a/dpcpp/solver/lower_trs_kernels.dp.cpp b/dpcpp/solver/lower_trs_kernels.dp.cpp index 6c4f8fa8537..449bfe5cfcf 100644 --- a/dpcpp/solver/lower_trs_kernels.dp.cpp +++ b/dpcpp/solver/lower_trs_kernels.dp.cpp @@ -4,13 +4,10 @@ #include "core/solver/lower_trs_kernels.hpp" - #include - #include - #include #include #include diff --git a/dpcpp/solver/multigrid_kernels.dp.cpp b/dpcpp/solver/multigrid_kernels.dp.cpp index d818211c28b..aaf0ab63354 100644 --- 
a/dpcpp/solver/multigrid_kernels.dp.cpp +++ b/dpcpp/solver/multigrid_kernels.dp.cpp @@ -4,13 +4,11 @@ #include "core/solver/multigrid_kernels.hpp" - #include #include #include #include - #include "core/components/fill_array_kernels.hpp" diff --git a/dpcpp/solver/upper_trs_kernels.dp.cpp b/dpcpp/solver/upper_trs_kernels.dp.cpp index 3729492eb18..7ac4950fe82 100644 --- a/dpcpp/solver/upper_trs_kernels.dp.cpp +++ b/dpcpp/solver/upper_trs_kernels.dp.cpp @@ -4,13 +4,10 @@ #include "core/solver/upper_trs_kernels.hpp" - #include - #include - #include #include #include diff --git a/dpcpp/stop/batch_criteria.hpp b/dpcpp/stop/batch_criteria.hpp index 3818831df11..a0b12326302 100644 --- a/dpcpp/stop/batch_criteria.hpp +++ b/dpcpp/stop/batch_criteria.hpp @@ -10,7 +10,6 @@ #include #include - #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" diff --git a/dpcpp/stop/criterion_kernels.dp.cpp b/dpcpp/stop/criterion_kernels.dp.cpp index ea8ab78aace..2970263f6ae 100644 --- a/dpcpp/stop/criterion_kernels.dp.cpp +++ b/dpcpp/stop/criterion_kernels.dp.cpp @@ -4,10 +4,8 @@ #include "core/stop/criterion_kernels.hpp" - #include - #include diff --git a/dpcpp/stop/residual_norm_kernels.dp.cpp b/dpcpp/stop/residual_norm_kernels.dp.cpp index 09aae963b16..ddb617a1a84 100644 --- a/dpcpp/stop/residual_norm_kernels.dp.cpp +++ b/dpcpp/stop/residual_norm_kernels.dp.cpp @@ -4,15 +4,12 @@ #include "core/stop/residual_norm_kernels.hpp" - #include - #include #include #include - #include "core/base/array_access.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" diff --git a/dpcpp/synthesizer/implementation_selection.hpp b/dpcpp/synthesizer/implementation_selection.hpp index 05cea2521b0..9bec1a42cd5 100644 --- a/dpcpp/synthesizer/implementation_selection.hpp +++ b/dpcpp/synthesizer/implementation_selection.hpp @@ -8,11 +8,9 @@ #include - #include #include - #include "dpcpp/base/config.hpp" diff --git 
a/dpcpp/test/base/dim3.dp.cpp b/dpcpp/test/base/dim3.dp.cpp index 6688e4e4163..cf0e5d1da30 100644 --- a/dpcpp/test/base/dim3.dp.cpp +++ b/dpcpp/test/base/dim3.dp.cpp @@ -4,10 +4,8 @@ #include "dpcpp/base/dim3.dp.hpp" - #include - #include diff --git a/dpcpp/test/base/executor.dp.cpp b/dpcpp/test/base/executor.dp.cpp index 771330e08bf..83a29a3b6db 100644 --- a/dpcpp/test/base/executor.dp.cpp +++ b/dpcpp/test/base/executor.dp.cpp @@ -2,23 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include - #include - #include - #include #include #include +#include namespace { diff --git a/dpcpp/test/base/kernel_launch.dp.cpp b/dpcpp/test/base/kernel_launch.dp.cpp index e95fac1082a..a6687583340 100644 --- a/dpcpp/test/base/kernel_launch.dp.cpp +++ b/dpcpp/test/base/kernel_launch.dp.cpp @@ -4,20 +4,16 @@ #include "common/unified/base/kernel_launch.hpp" - #include #include - #include - #include #include #include #include - #include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/base/array_access.hpp" diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp index ab94fc0364b..27e14b62d2d 100644 --- a/dpcpp/test/components/cooperative_groups.dp.cpp +++ b/dpcpp/test/components/cooperative_groups.dp.cpp @@ -4,22 +4,17 @@ #include "dpcpp/components/cooperative_groups.dp.hpp" - #include #include - #include - #include - #include #include #include - #include "core/base/types.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "core/test/utils/assertions.hpp" diff --git a/dpcpp/test/matrix/fbcsr_kernels.dp.cpp b/dpcpp/test/matrix/fbcsr_kernels.dp.cpp index 6d6f9fb6e93..98849e4fe00 100644 --- a/dpcpp/test/matrix/fbcsr_kernels.dp.cpp +++ b/dpcpp/test/matrix/fbcsr_kernels.dp.cpp @@ -4,10 +4,8 @@ #include - #include - #include "core/test/matrix/fbcsr_sample.hpp" #include "core/test/utils.hpp" diff --git 
a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp index 3b62c328366..6dcfe460c71 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp @@ -2,22 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include - #include - #include #include #include #include - +#include #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" diff --git a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp index 04dde86a07a..541798b8c00 100644 --- a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp +++ b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include #include +#include + int main(int argc, char* argv[]) { diff --git a/examples/cb-gmres/cb-gmres.cpp b/examples/cb-gmres/cb-gmres.cpp index 84b9c37592b..3eb221b3a48 100644 --- a/examples/cb-gmres/cb-gmres.cpp +++ b/examples/cb-gmres/cb-gmres.cpp @@ -3,8 +3,6 @@ // SPDX-License-Identifier: BSD-3-Clause // This is the main ginkgo header file. 
-#include - #include #include #include @@ -12,6 +10,8 @@ #include #include +#include + // Helper function which measures the time of `solver->apply(b, x)` in seconds // To get an accurate result, the solve is repeated multiple times (while diff --git a/examples/custom-matrix-format/custom-matrix-format.cpp b/examples/custom-matrix-format/custom-matrix-format.cpp index a5e3cc94997..d2ec94215b3 100644 --- a/examples/custom-matrix-format/custom-matrix-format.cpp +++ b/examples/custom-matrix-format/custom-matrix-format.cpp @@ -6,8 +6,8 @@ #include #include - #include + #include diff --git a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp index 39baed56f67..030e11323af 100644 --- a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp +++ b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include #include +#include + /** * The ByInteraction class is a criterion which asks for user input to stop diff --git a/examples/external-lib-interfacing/external-lib-interfacing.cpp b/examples/external-lib-interfacing/external-lib-interfacing.cpp index 04824cb9578..a3b37b00b1a 100644 --- a/examples/external-lib-interfacing/external-lib-interfacing.cpp +++ b/examples/external-lib-interfacing/external-lib-interfacing.cpp @@ -69,7 +69,6 @@ // not unlike the Function class, but with the difference that // the return value is tensor-valued rather than scalar of vector-valued. 
#include - #include // Ginkgo's header file diff --git a/examples/ginkgo-overhead/ginkgo-overhead.cpp b/examples/ginkgo-overhead/ginkgo-overhead.cpp index 5330dda1e7d..c36cf60c39c 100644 --- a/examples/ginkgo-overhead/ginkgo-overhead.cpp +++ b/examples/ginkgo-overhead/ginkgo-overhead.cpp @@ -2,13 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include +#include + [[noreturn]] void print_usage_and_exit(const char* name) { diff --git a/examples/ginkgo-ranges/ginkgo-ranges.cpp b/examples/ginkgo-ranges/ginkgo-ranges.cpp index 38486a25b2e..503ee8b62e3 100644 --- a/examples/ginkgo-ranges/ginkgo-ranges.cpp +++ b/examples/ginkgo-ranges/ginkgo-ranges.cpp @@ -2,10 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include #include #include +#include + // LU factorization implementation using Ginkgo ranges // For simplicity, we only consider square matrices, and no pivoting. diff --git a/examples/heat-equation/heat-equation.cpp b/examples/heat-equation/heat-equation.cpp index c026c343997..286559e1cc3 100644 --- a/examples/heat-equation/heat-equation.cpp +++ b/examples/heat-equation/heat-equation.cpp @@ -36,17 +36,15 @@ vector initialization, solver setup and the use of Ginkgo in a more complex setting. ***************************************************************/ -#include - - #include #include #include - #include #include +#include + // This function implements a simple Ginkgo-themed clamped color mapping for // values in the range [0,5]. 
diff --git a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp index ad7e1c07158..54a45f0f2e1 100644 --- a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp +++ b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include #include +#include + int main(int argc, char* argv[]) { diff --git a/examples/inverse-iteration/inverse-iteration.cpp b/examples/inverse-iteration/inverse-iteration.cpp index 03c9f1fe5e8..a348cfe635c 100644 --- a/examples/inverse-iteration/inverse-iteration.cpp +++ b/examples/inverse-iteration/inverse-iteration.cpp @@ -2,9 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include @@ -13,6 +10,8 @@ #include #include +#include + int main(int argc, char* argv[]) { diff --git a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp index 34fc684bcf6..10126427441 100644 --- a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp +++ b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include #include +#include + int main(int argc, char* argv[]) { diff --git a/examples/iterative-refinement/iterative-refinement.cpp b/examples/iterative-refinement/iterative-refinement.cpp index aa38e54ede2..4684b425b0f 100644 --- a/examples/iterative-refinement/iterative-refinement.cpp +++ b/examples/iterative-refinement/iterative-refinement.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include #include +#include + int main(int argc, char* argv[]) { diff --git a/examples/kokkos-assembly/kokkos-assembly.cpp 
b/examples/kokkos-assembly/kokkos-assembly.cpp index d1c19d1b3e7..3eed9271d6c 100644 --- a/examples/kokkos-assembly/kokkos-assembly.cpp +++ b/examples/kokkos-assembly/kokkos-assembly.cpp @@ -5,12 +5,11 @@ #include #include - #include +#include #include -#include namespace gko::ext::kokkos::detail { diff --git a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp index 0d4ba7d67d4..848742cf544 100644 --- a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp +++ b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp @@ -2,9 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include #include +#include + int main() { // Instantiate a CUDA executor diff --git a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp index 6690c8e13d3..d598bb48a46 100644 --- a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp +++ b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include #include +#include + int main(int argc, char* argv[]) { diff --git a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp index 08575d6306c..383c721a3e1 100644 --- a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp +++ b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include #include +#include + int main(int argc, char* argv[]) { diff --git a/examples/mixed-precision-ir/mixed-precision-ir.cpp b/examples/mixed-precision-ir/mixed-precision-ir.cpp index ed6fda2c689..4e8b37f6732 100644 --- a/examples/mixed-precision-ir/mixed-precision-ir.cpp +++ 
b/examples/mixed-precision-ir/mixed-precision-ir.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include #include +#include + int main(int argc, char* argv[]) { diff --git a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp index 9b114b611af..962e96c69a2 100644 --- a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp +++ b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include #include +#include + int main(int argc, char* argv[]) { diff --git a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp index 51f17b7821c..64d39e806f3 100644 --- a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp +++ b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include #include +#include + int main(int argc, char* argv[]) { diff --git a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp index 59c756e2a69..155d4a59370 100644 --- a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp +++ b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp @@ -43,12 +43,13 @@ hand side vector changes when increasing the dimension. #include #include -#include #include #include #include #include +#include + // Stencil values. 
Ordering can be seen in the main function // Can also be changed by passing additional parameter when executing constexpr double default_alpha = 10.0 / 3.0; diff --git a/examples/papi-logging/papi-logging.cpp b/examples/papi-logging/papi-logging.cpp index 6be633aff03..159d5cf647d 100644 --- a/examples/papi-logging/papi-logging.cpp +++ b/examples/papi-logging/papi-logging.cpp @@ -2,16 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - -#include #include #include #include #include #include +#include + +#include + namespace { diff --git a/examples/par-ilu-convergence/par-ilu-convergence.cpp b/examples/par-ilu-convergence/par-ilu-convergence.cpp index bf0e4e7a990..72e72cf7480 100644 --- a/examples/par-ilu-convergence/par-ilu-convergence.cpp +++ b/examples/par-ilu-convergence/par-ilu-convergence.cpp @@ -9,7 +9,6 @@ #include #include - #include diff --git a/examples/performance-debugging/performance-debugging.cpp b/examples/performance-debugging/performance-debugging.cpp index 00dafc45378..9f956106fd5 100644 --- a/examples/performance-debugging/performance-debugging.cpp +++ b/examples/performance-debugging/performance-debugging.cpp @@ -2,9 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include @@ -20,6 +17,8 @@ #include #include +#include + template using vec = gko::matrix::Dense; diff --git a/examples/poisson-solver/poisson-solver.cpp b/examples/poisson-solver/poisson-solver.cpp index d70dd1aa506..f508869c63d 100644 --- a/examples/poisson-solver/poisson-solver.cpp +++ b/examples/poisson-solver/poisson-solver.cpp @@ -2,12 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include #include #include #include #include +#include + // Creates a stencil matrix in CSR format for the given number of discretization // points. 
diff --git a/examples/preconditioned-solver/preconditioned-solver.cpp b/examples/preconditioned-solver/preconditioned-solver.cpp index 2291c3cb2ed..0284fdf26cc 100644 --- a/examples/preconditioned-solver/preconditioned-solver.cpp +++ b/examples/preconditioned-solver/preconditioned-solver.cpp @@ -2,14 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include +#include + int main(int argc, char* argv[]) { diff --git a/examples/preconditioner-export/preconditioner-export.cpp b/examples/preconditioner-export/preconditioner-export.cpp index e6a405cde4a..c37951bcaff 100644 --- a/examples/preconditioner-export/preconditioner-export.cpp +++ b/examples/preconditioner-export/preconditioner-export.cpp @@ -2,9 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include @@ -12,6 +9,8 @@ #include #include +#include + const std::map()>> executors{{"reference", [] { return gko::ReferenceExecutor::create(); }}, diff --git a/examples/reordered-preconditioned-solver/reordered-preconditioned-solver.cpp b/examples/reordered-preconditioned-solver/reordered-preconditioned-solver.cpp index 490e36ad387..7a227fd0ee2 100644 --- a/examples/reordered-preconditioned-solver/reordered-preconditioned-solver.cpp +++ b/examples/reordered-preconditioned-solver/reordered-preconditioned-solver.cpp @@ -7,7 +7,6 @@ #include #include - #include diff --git a/examples/schroedinger-splitting/schroedinger-splitting.cpp b/examples/schroedinger-splitting/schroedinger-splitting.cpp index 4390287c30f..cadb186c23b 100644 --- a/examples/schroedinger-splitting/schroedinger-splitting.cpp +++ b/examples/schroedinger-splitting/schroedinger-splitting.cpp @@ -47,8 +47,6 @@ to the non-linear part, which turns it into the Gross–Pitaevskii equation. ***************************************************************/ -#include - #include #include #include @@ -58,6 +56,8 @@ to the non-linear part, which turns it into the Gross–Pitaevskii equation. 
#include #include +#include + // This function implements a simple Ginkgo-themed clamped color mapping for // values in the range [0,5]. diff --git a/examples/simple-solver-logging/simple-solver-logging.cpp b/examples/simple-solver-logging/simple-solver-logging.cpp index 3bcbd834bc3..158f94cff25 100644 --- a/examples/simple-solver-logging/simple-solver-logging.cpp +++ b/examples/simple-solver-logging/simple-solver-logging.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include #include +#include + namespace { diff --git a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp index a28ab925c88..0f77d69cf3d 100644 --- a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp +++ b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp @@ -41,12 +41,13 @@ use Ginkgo, and the only part where Ginkgo is introduced is inside the `solve_system` function. ***************************************************************/ -#include #include #include #include #include +#include + // Creates a stencil matrix in CSR format for the given number of discretization // points. 
diff --git a/extensions/test/config/json_config.cpp b/extensions/test/config/json_config.cpp index 13191a2ff9a..a46cdd93628 100644 --- a/extensions/test/config/json_config.cpp +++ b/extensions/test/config/json_config.cpp @@ -4,15 +4,12 @@ #include - #include #include - #include #include - #include "core/test/utils.hpp" #include "extensions/test/config/file_location.hpp" diff --git a/extensions/test/kokkos/kokkos_main.cpp b/extensions/test/kokkos/kokkos_main.cpp index e541d362244..7a85c379cdd 100644 --- a/extensions/test/kokkos/kokkos_main.cpp +++ b/extensions/test/kokkos/kokkos_main.cpp @@ -4,10 +4,8 @@ #include - #include - #include "core/test/gtest/environments.hpp" diff --git a/extensions/test/kokkos/spaces.cpp b/extensions/test/kokkos/spaces.cpp index 47e24aac93e..e15c3579564 100644 --- a/extensions/test/kokkos/spaces.cpp +++ b/extensions/test/kokkos/spaces.cpp @@ -4,13 +4,10 @@ #include - #include - #include - #include "core/test/gtest/environments.hpp" #include "core/test/utils.hpp" diff --git a/extensions/test/kokkos/types.cpp b/extensions/test/kokkos/types.cpp index 4bff41499e9..bb3252b149c 100644 --- a/extensions/test/kokkos/types.cpp +++ b/extensions/test/kokkos/types.cpp @@ -5,18 +5,14 @@ #include #include - #include - #include - #include #include #include - #include "core/test/utils.hpp" diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp index 74e6c34dc5d..86b16c8975d 100644 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -4,15 +4,12 @@ #include "core/base/batch_multi_vector_kernels.hpp" - #include #include - #include #include - #include "common/cuda_hip/base/blas_bindings.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp index 4f09ec66bb8..3e4cba6a747 100644 --- a/hip/base/batch_struct.hip.hpp +++ 
b/hip/base/batch_struct.hip.hpp @@ -9,7 +9,6 @@ #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "core/base/batch_struct.hpp" diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp index 89dc67255fc..e74153cc34e 100644 --- a/hip/base/config.hip.hpp +++ b/hip/base/config.hip.hpp @@ -8,7 +8,6 @@ #include - #include "common/cuda_hip/base/runtime.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp index d1d4325c6f1..f9e5dadce52 100644 --- a/hip/base/device.hip.cpp +++ b/hip/base/device.hip.cpp @@ -4,12 +4,10 @@ #include "ginkgo/core/base/device.hpp" - #include #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/device_matrix_data_kernels.hip.cpp b/hip/base/device_matrix_data_kernels.hip.cpp index 5a0b762ea57..d63a8e27ed5 100644 --- a/hip/base/device_matrix_data_kernels.hip.cpp +++ b/hip/base/device_matrix_data_kernels.hip.cpp @@ -4,7 +4,6 @@ #include "core/base/device_matrix_data_kernels.hpp" - #include #include #include @@ -13,7 +12,6 @@ #include #include - #include "common/cuda_hip/base/types.hpp" #include "hip/base/thrust.hip.hpp" diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp index 05b030ad375..c83778951d0 100644 --- a/hip/base/exception.hip.cpp +++ b/hip/base/exception.hip.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/base/exception.hpp" - #include @@ -21,7 +20,6 @@ #include - #include "common/cuda_hip/base/runtime.hpp" diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index e371e48f489..9e09912c5c9 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -4,15 +4,12 @@ #include "ginkgo/core/base/executor.hpp" - #include - #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "hip/base/hipblas_bindings.hip.hpp" diff --git 
a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp index d5dc94d6138..21c44e664b8 100644 --- a/hip/base/hipblas_bindings.hip.hpp +++ b/hip/base/hipblas_bindings.hip.hpp @@ -16,7 +16,6 @@ #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp index 9fd7ade8231..a76274c45a7 100644 --- a/hip/base/hiprand_bindings.hip.hpp +++ b/hip/base/hiprand_bindings.hip.hpp @@ -15,7 +15,6 @@ #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp index 0337f0a03c6..af01f9dc94a 100644 --- a/hip/base/hipsparse_bindings.hip.hpp +++ b/hip/base/hipsparse_bindings.hip.hpp @@ -16,7 +16,6 @@ #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp index 6fb70c4571c..d68ceb48ddd 100644 --- a/hip/base/hipsparse_block_bindings.hip.hpp +++ b/hip/base/hipsparse_block_bindings.hip.hpp @@ -15,7 +15,6 @@ #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" diff --git a/hip/base/index_set_kernels.hip.cpp b/hip/base/index_set_kernels.hip.cpp index a246b5bf57e..9f9f967fe35 100644 --- a/hip/base/index_set_kernels.hip.cpp +++ b/hip/base/index_set_kernels.hip.cpp @@ -4,10 +4,8 @@ #include "core/base/index_set_kernels.hpp" - #include - #include #include #include diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp index 890b9922a4c..ff9f398c0bc 100644 --- a/hip/base/kernel_launch.hip.hpp +++ b/hip/base/kernel_launch.hip.hpp @@ -10,7 +10,6 @@ #include - #include "accessor/cuda_hip_helper.hpp" #include 
"common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/hip/base/math.hip.hpp b/hip/base/math.hip.hpp index f9427089126..9f577812f3e 100644 --- a/hip/base/math.hip.hpp +++ b/hip/base/math.hip.hpp @@ -6,11 +6,10 @@ #define GKO_HIP_BASE_MATH_HIP_HPP_ -#include - - #include +#include + namespace gko { diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp index 27d510d784b..6ac3070192a 100644 --- a/hip/base/memory.hip.cpp +++ b/hip/base/memory.hip.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/base/memory.hpp" - #include - #include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp index 5cd4b3ec58f..d14c8468c0b 100644 --- a/hip/base/pointer_mode_guard.hip.hpp +++ b/hip/base/pointer_mode_guard.hip.hpp @@ -22,7 +22,6 @@ #include #include - #include "common/cuda_hip/base/runtime.hpp" diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp index 46dad3be816..9f4f44ec815 100644 --- a/hip/base/roctx.hip.cpp +++ b/hip/base/roctx.hip.cpp @@ -4,7 +4,6 @@ #include - #include "common/cuda_hip/base/runtime.hpp" diff --git a/hip/base/scoped_device_id.hip.cpp b/hip/base/scoped_device_id.hip.cpp index 1fd7211b106..e16c2b5701a 100644 --- a/hip/base/scoped_device_id.hip.cpp +++ b/hip/base/scoped_device_id.hip.cpp @@ -2,15 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "hip/base/scoped_device_id.hip.hpp" + #include #include - #include - #include "common/cuda_hip/base/runtime.hpp" -#include "hip/base/scoped_device_id.hip.hpp" namespace gko { diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp index d5acb978e22..d57f63c4e7c 100644 --- a/hip/base/stream.hip.cpp +++ b/hip/base/stream.hip.cpp @@ -4,12 +4,10 @@ #include "ginkgo/core/base/stream.hpp" - #include #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/thrust.hip.hpp 
b/hip/base/thrust.hip.hpp index 2c0412fb67d..2aecdd79328 100644 --- a/hip/base/thrust.hip.hpp +++ b/hip/base/thrust.hip.hpp @@ -8,7 +8,6 @@ #include - #include #include #if GINKGO_HIP_PLATFORM_HCC diff --git a/hip/base/timer.hip.cpp b/hip/base/timer.hip.cpp index 67a9a8153b6..800f4a739c1 100644 --- a/hip/base/timer.hip.cpp +++ b/hip/base/timer.hip.cpp @@ -4,10 +4,8 @@ #include "ginkgo/core/base/timer.hpp" - #include - #include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 9ae2224c064..bb0d4a2d0c9 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -6,15 +6,13 @@ #define GKO_HIP_BASE_TYPES_HIP_HPP_ -#include - - #include - #include #include +#include + #if HIP_VERSION >= 50200000 #include @@ -23,10 +21,8 @@ #endif #include - #include - #include "common/cuda_hip/base/runtime.hpp" diff --git a/hip/components/atomic.hip.hpp b/hip/components/atomic.hip.hpp index 0dc8d7a3b46..64d39a90d78 100644 --- a/hip/components/atomic.hip.hpp +++ b/hip/components/atomic.hip.hpp @@ -8,7 +8,6 @@ #include - #include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index e81441a092b..d3dbc44a5c8 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -8,7 +8,6 @@ #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/hip/components/diagonal_block_manipulation.hip.hpp b/hip/components/diagonal_block_manipulation.hip.hpp index 290511e7583..7a3893fa031 100644 --- a/hip/components/diagonal_block_manipulation.hip.hpp +++ b/hip/components/diagonal_block_manipulation.hip.hpp @@ -8,7 +8,6 @@ #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" diff --git 
a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp index 07daf486d84..d2cbc3062a5 100644 --- a/hip/components/format_conversion.hip.hpp +++ b/hip/components/format_conversion.hip.hpp @@ -9,7 +9,6 @@ #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp index 4bb6fa19ec0..d8238c11795 100644 --- a/hip/components/memory.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -9,10 +9,8 @@ #include #include - #include - #include "common/cuda_hip/base/types.hpp" diff --git a/hip/components/prefix_sum.hip.hpp b/hip/components/prefix_sum.hip.hpp index 5acde03cbec..deb78288e6c 100644 --- a/hip/components/prefix_sum.hip.hpp +++ b/hip/components/prefix_sum.hip.hpp @@ -8,7 +8,6 @@ #include - #include "common/cuda_hip/base/blas_bindings.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/components/reduction.hip.hpp" diff --git a/hip/components/prefix_sum_kernels.hip.cpp b/hip/components/prefix_sum_kernels.hip.cpp index ad55c0954d1..283e8c161a1 100644 --- a/hip/components/prefix_sum_kernels.hip.cpp +++ b/hip/components/prefix_sum_kernels.hip.cpp @@ -4,18 +4,14 @@ #include "core/components/prefix_sum_kernels.hpp" - #include - #include - #include #include #include - #include "hip/base/thrust.hip.hpp" diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp index fb0539952ff..bc2594dd96d 100644 --- a/hip/components/reduction.hip.hpp +++ b/hip/components/reduction.hip.hpp @@ -8,11 +8,9 @@ #include - #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/hip/components/syncfree.hip.hpp b/hip/components/syncfree.hip.hpp index 7627a0a2781..c174224c9c4 100644 --- a/hip/components/syncfree.hip.hpp +++ 
b/hip/components/syncfree.hip.hpp @@ -8,7 +8,6 @@ #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/memory.hpp" diff --git a/hip/components/warp_blas.hip.hpp b/hip/components/warp_blas.hip.hpp index 8ac59719aa7..9164a1914b3 100644 --- a/hip/components/warp_blas.hip.hpp +++ b/hip/components/warp_blas.hip.hpp @@ -9,10 +9,8 @@ #include #include - #include - #include "hip/base/math.hip.hpp" #include "hip/components/reduction.hip.hpp" diff --git a/hip/distributed/index_map_kernels.hip.cpp b/hip/distributed/index_map_kernels.hip.cpp index d45674a66a3..536b09a1bb1 100644 --- a/hip/distributed/index_map_kernels.hip.cpp +++ b/hip/distributed/index_map_kernels.hip.cpp @@ -4,7 +4,6 @@ #include "core/distributed/index_map_kernels.hpp" - #include #include #include @@ -18,10 +17,8 @@ #include #include - #include - #include "hip/base/thrust.hip.hpp" #include "hip/components/atomic.hip.hpp" #include "hip/components/searching.hip.hpp" diff --git a/hip/distributed/matrix_kernels.hip.cpp b/hip/distributed/matrix_kernels.hip.cpp index 54cde64c429..535fdaacb44 100644 --- a/hip/distributed/matrix_kernels.hip.cpp +++ b/hip/distributed/matrix_kernels.hip.cpp @@ -4,7 +4,6 @@ #include "core/distributed/matrix_kernels.hpp" - #include #include #include @@ -17,10 +16,8 @@ #include #include - #include - #include "hip/base/thrust.hip.hpp" #include "hip/components/atomic.hip.hpp" diff --git a/hip/distributed/partition_helpers_kernels.hip.cpp b/hip/distributed/partition_helpers_kernels.hip.cpp index 744d8de887b..a2083a55303 100644 --- a/hip/distributed/partition_helpers_kernels.hip.cpp +++ b/hip/distributed/partition_helpers_kernels.hip.cpp @@ -4,13 +4,11 @@ #include "core/distributed/partition_helpers_kernels.hpp" - #include #include #include #include - #include "hip/base/thrust.hip.hpp" diff --git a/hip/distributed/partition_kernels.hip.cpp b/hip/distributed/partition_kernels.hip.cpp index 
00dc74b910f..c2c4a8f28ea 100644 --- a/hip/distributed/partition_kernels.hip.cpp +++ b/hip/distributed/partition_kernels.hip.cpp @@ -4,7 +4,6 @@ #include "core/distributed/partition_kernels.hpp" - #include #include #include @@ -12,7 +11,6 @@ #include #include - #include "common/unified/base/kernel_launch.hpp" #include "core/components/fill_array_kernels.hpp" #include "hip/base/thrust.hip.hpp" diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp index fc6718dec0d..eff7936076d 100644 --- a/hip/distributed/vector_kernels.hip.cpp +++ b/hip/distributed/vector_kernels.hip.cpp @@ -4,7 +4,6 @@ #include "core/distributed/vector_kernels.hpp" - #include #include #include @@ -12,10 +11,8 @@ #include #include - #include - #include "hip/base/thrust.hip.hpp" diff --git a/hip/factorization/cholesky_kernels.hip.cpp b/hip/factorization/cholesky_kernels.hip.cpp index 419db21b811..1c1ce1d3170 100644 --- a/hip/factorization/cholesky_kernels.hip.cpp +++ b/hip/factorization/cholesky_kernels.hip.cpp @@ -4,11 +4,9 @@ #include "core/factorization/cholesky_kernels.hpp" - #include #include - #include #include #include @@ -16,10 +14,8 @@ #include #include - #include - #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/fill_array_kernels.hpp" diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp index 4080768bc07..d6768e5e9c6 100644 --- a/hip/factorization/factorization_kernels.hip.cpp +++ b/hip/factorization/factorization_kernels.hip.cpp @@ -4,10 +4,8 @@ #include "core/factorization/factorization_kernels.hpp" - #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/hip/factorization/ic_kernels.hip.cpp b/hip/factorization/ic_kernels.hip.cpp index edda974fd36..cfbb12bd5b3 100644 --- 
a/hip/factorization/ic_kernels.hip.cpp +++ b/hip/factorization/ic_kernels.hip.cpp @@ -4,10 +4,8 @@ #include "core/factorization/ic_kernels.hpp" - #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" diff --git a/hip/factorization/ilu_kernels.hip.cpp b/hip/factorization/ilu_kernels.hip.cpp index f50df5ca75b..45d468d0500 100644 --- a/hip/factorization/ilu_kernels.hip.cpp +++ b/hip/factorization/ilu_kernels.hip.cpp @@ -4,10 +4,8 @@ #include "core/factorization/ilu_kernels.hpp" - #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" diff --git a/hip/factorization/lu_kernels.hip.cpp b/hip/factorization/lu_kernels.hip.cpp index ec3e771134e..8e37d1a2445 100644 --- a/hip/factorization/lu_kernels.hip.cpp +++ b/hip/factorization/lu_kernels.hip.cpp @@ -4,19 +4,15 @@ #include "core/factorization/lu_kernels.hpp" - #include #include - #include #include #include - #include - #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/allocator.hpp" diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp index e4cd0b2470b..f0e0cb0b632 100644 --- a/hip/factorization/par_ic_kernels.hip.cpp +++ b/hip/factorization/par_ic_kernels.hip.cpp @@ -4,12 +4,10 @@ #include "core/factorization/par_ic_kernels.hpp" - #include #include #include - #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/memory.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp index 7f5dba82eba..99b2f09274b 100644 --- a/hip/factorization/par_ict_kernels.hip.cpp +++ b/hip/factorization/par_ict_kernels.hip.cpp @@ -4,14 +4,12 @@ #include "core/factorization/par_ict_kernels.hpp" - #include #include #include #include #include - #include "common/cuda_hip/base/runtime.hpp" #include 
"common/cuda_hip/components/memory.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp index 49608d6801f..b4897a23cf9 100644 --- a/hip/factorization/par_ilu_kernels.hip.cpp +++ b/hip/factorization/par_ilu_kernels.hip.cpp @@ -4,10 +4,8 @@ #include "core/factorization/par_ilu_kernels.hpp" - #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/memory.hpp" diff --git a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp index b5612ea29c6..b4fdd7e6e6d 100644 --- a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp +++ b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp @@ -2,24 +2,20 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include - #include #include #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/hip/factorization/par_ilut_filter_kernels.hip.cpp b/hip/factorization/par_ilut_filter_kernels.hip.cpp index e6d0a6348cc..8f91e6f7087 100644 --- a/hip/factorization/par_ilut_filter_kernels.hip.cpp +++ b/hip/factorization/par_ilut_filter_kernels.hip.cpp @@ -2,21 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include #include #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include 
"common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp index 5486b3f5ba5..098ce5c9887 100644 --- a/hip/factorization/par_ilut_select_common.hip.cpp +++ b/hip/factorization/par_ilut_select_common.hip.cpp @@ -2,7 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause - // clang-format off // prevent compilation failure related to disappearing assert(...) statements #include "common/cuda_hip/base/runtime.hpp" @@ -11,7 +10,6 @@ #include "hip/factorization/par_ilut_select_common.hip.hpp" - #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/factorization/par_ilut_select_kernels.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp index b259133b95d..55180bc3d05 100644 --- a/hip/factorization/par_ilut_select_kernels.hip.cpp +++ b/hip/factorization/par_ilut_select_kernels.hip.cpp @@ -2,19 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include - #include #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "hip/base/math.hip.hpp" #include "hip/components/atomic.hip.hpp" #include "hip/components/intrinsics.hip.hpp" diff --git a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp index df77b1ba7a2..200a16ea849 100644 --- a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp +++ b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp @@ -2,19 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include 
"core/factorization/par_ilut_kernels.hpp" - - #include #include #include #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/hip/factorization/par_ilut_sweep_kernels.hip.cpp b/hip/factorization/par_ilut_sweep_kernels.hip.cpp index 0f1e6455812..b3994706567 100644 --- a/hip/factorization/par_ilut_sweep_kernels.hip.cpp +++ b/hip/factorization/par_ilut_sweep_kernels.hip.cpp @@ -2,19 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - #include #include #include #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/memory.hpp" #include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp index de73576ffed..4b0e6799834 100644 --- a/hip/matrix/batch_csr_kernels.hip.cpp +++ b/hip/matrix/batch_csr_kernels.hip.cpp @@ -4,15 +4,12 @@ #include "core/matrix/batch_csr_kernels.hpp" - #include - #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp index 5d3b9d8cef9..328f268251f 100644 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -4,15 +4,12 @@ #include "core/matrix/batch_dense_kernels.hpp" - #include - #include #include #include - #include "common/cuda_hip/base/config.hpp" 
#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp index d415f114c3b..01294ac3d63 100644 --- a/hip/matrix/batch_ell_kernels.hip.cpp +++ b/hip/matrix/batch_ell_kernels.hip.cpp @@ -4,15 +4,12 @@ #include "core/matrix/batch_ell_kernels.hpp" - #include - #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp index 16a267d95b6..bb9f7912cd6 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/matrix/batch_struct.hip.hpp @@ -6,15 +6,12 @@ #define GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ -#include "core/matrix/batch_struct.hpp" - - #include #include - #include "common/cuda_hip/base/types.hpp" #include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" namespace gko { diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp index 8f7a050ef87..fe78b938e3c 100644 --- a/hip/matrix/coo_kernels.hip.cpp +++ b/hip/matrix/coo_kernels.hip.cpp @@ -4,14 +4,12 @@ #include "core/matrix/coo_kernels.hpp" - #include #include #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp index 8b3579f049c..acd0b0144bb 100644 --- a/hip/matrix/csr_kernels.template.hip.cpp +++ b/hip/matrix/csr_kernels.template.hip.cpp @@ -4,10 +4,8 @@ #include "core/matrix/csr_kernels.hpp" - #include - #include #include #include @@ -16,7 +14,6 @@ #include #include - #include #include #include @@ -26,7 +23,6 @@ #include #include - #include "accessor/cuda_hip_helper.hpp" #include "common/cuda_hip/base/config.hpp" #include 
"common/cuda_hip/base/pointer_mode_guard.hpp" diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 8fed3c97c1b..82599050719 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -4,7 +4,6 @@ #include "core/matrix/dense_kernels.hpp" - #include #include #include @@ -16,7 +15,6 @@ #include #include - #include "common/cuda_hip/base/blas_bindings.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" diff --git a/hip/matrix/diagonal_kernels.hip.cpp b/hip/matrix/diagonal_kernels.hip.cpp index 01033004c6b..b9585db9b41 100644 --- a/hip/matrix/diagonal_kernels.hip.cpp +++ b/hip/matrix/diagonal_kernels.hip.cpp @@ -4,11 +4,9 @@ #include "core/matrix/diagonal_kernels.hpp" - #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 4f1ff6a3539..cb8cca32d89 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -4,17 +4,14 @@ #include "core/matrix/ell_kernels.hpp" - #include - #include #include #include #include #include - #include "accessor/cuda_hip_helper.hpp" #include "accessor/reduced_row_major.hpp" #include "common/cuda_hip/base/config.hpp" diff --git a/hip/matrix/fbcsr_kernels.template.hip.cpp b/hip/matrix/fbcsr_kernels.template.hip.cpp index 0286aff0bba..c5d49215042 100644 --- a/hip/matrix/fbcsr_kernels.template.hip.cpp +++ b/hip/matrix/fbcsr_kernels.template.hip.cpp @@ -4,10 +4,8 @@ #include "core/matrix/fbcsr_kernels.hpp" - #include - #include #include #include @@ -16,14 +14,12 @@ #include #include - #include #include #include #include #include - #include "common/cuda_hip/base/blas_bindings.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" diff --git a/hip/matrix/fft_kernels.hip.cpp 
b/hip/matrix/fft_kernels.hip.cpp index 31e180b4414..6b14aaf067d 100644 --- a/hip/matrix/fft_kernels.hip.cpp +++ b/hip/matrix/fft_kernels.hip.cpp @@ -4,7 +4,6 @@ #include "core/matrix/fft_kernels.hpp" - #include @@ -19,7 +18,6 @@ #include #include - #include "common/cuda_hip/base/runtime.hpp" diff --git a/hip/matrix/fft_kernels_stub.hip.cpp b/hip/matrix/fft_kernels_stub.hip.cpp index f50bec4ff0b..210349e58e4 100644 --- a/hip/matrix/fft_kernels_stub.hip.cpp +++ b/hip/matrix/fft_kernels_stub.hip.cpp @@ -4,7 +4,6 @@ #include "core/matrix/fft_kernels.hpp" - #include #include #include diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp index f1e15c946e0..4caf83fdaa1 100644 --- a/hip/matrix/sellp_kernels.hip.cpp +++ b/hip/matrix/sellp_kernels.hip.cpp @@ -4,14 +4,12 @@ #include "core/matrix/sellp_kernels.hpp" - #include #include #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp index 487b134d28a..7a7a4ba49d5 100644 --- a/hip/matrix/sparsity_csr_kernels.hip.cpp +++ b/hip/matrix/sparsity_csr_kernels.hip.cpp @@ -4,13 +4,10 @@ #include "core/matrix/sparsity_csr_kernels.hpp" - #include - #include - #include "accessor/cuda_hip_helper.hpp" #include "accessor/reduced_row_major.hpp" #include "common/cuda_hip/base/config.hpp" diff --git a/hip/multigrid/pgm_kernels.hip.cpp b/hip/multigrid/pgm_kernels.hip.cpp index 18c1f0957c4..da5890315bc 100644 --- a/hip/multigrid/pgm_kernels.hip.cpp +++ b/hip/multigrid/pgm_kernels.hip.cpp @@ -4,21 +4,17 @@ #include "core/multigrid/pgm_kernels.hpp" - #include - #include #include #include #include #include - #include #include - #include "common/cuda_hip/base/types.hpp" #include "hip/base/thrust.hip.hpp" diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp 
b/hip/preconditioner/batch_jacobi_kernels.hip.cpp index f366636a48f..db6e5a27b58 100644 --- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp +++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp @@ -4,12 +4,10 @@ #include "core/preconditioner/batch_jacobi_kernels.hpp" - #include #include #include - #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp index 4eaf65cc438..d3c2bd0fb1d 100644 --- a/hip/preconditioner/isai_kernels.hip.cpp +++ b/hip/preconditioner/isai_kernels.hip.cpp @@ -4,12 +4,10 @@ #include "core/preconditioner/isai_kernels.hpp" - #include #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp index 0a78eac4145..0eccbb2d6eb 100644 --- a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp @@ -2,13 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include - #include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp index 358c6f3b337..7e6311bcd52 100644 --- a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp @@ -2,18 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include - #include 
"common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/preconditioner/jacobi_generate_kernels.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.hip.cpp index 6365f6c132e..9f2d3238a83 100644 --- a/hip/preconditioner/jacobi_generate_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernels.hip.cpp @@ -2,19 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp index 4634f8a0c57..3685df4aa0e 100644 --- a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp @@ -2,18 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include 
"core/components/fill_array_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp index a3b2b7e5412..122e53f636d 100644 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -4,10 +4,8 @@ #include "core/preconditioner/jacobi_kernels.hpp" - #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp index 37b78f17469..d922d178f88 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp @@ -2,18 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp index 421a32c3efc..baa847c58a5 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp @@ -2,17 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - 
#include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/reorder/rcm_kernels.hip.cpp b/hip/reorder/rcm_kernels.hip.cpp index 9a5739064eb..9ac6e44e173 100644 --- a/hip/reorder/rcm_kernels.hip.cpp +++ b/hip/reorder/rcm_kernels.hip.cpp @@ -4,7 +4,6 @@ #include "core/reorder/rcm_kernels.hpp" - #include #include #include @@ -16,7 +15,6 @@ #include #include - #include #include #include @@ -24,7 +22,6 @@ #include #include - #include "common/cuda_hip/components/memory.hpp" #include "core/base/array_access.hpp" #include "hip/base/thrust.hip.hpp" diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index fdeb0580931..44e2f0f3c48 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -4,15 +4,12 @@ #include "core/solver/batch_bicgstab_kernels.hpp" - #include #include - #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 47c2bc498eb..450d02a302c 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -4,15 +4,12 @@ #include "core/solver/batch_cg_kernels.hpp" - #include #include - #include #include - #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp index 2f2df4ddf84..fd046d000b4 100644 --- 
a/hip/solver/cb_gmres_kernels.hip.cpp +++ b/hip/solver/cb_gmres_kernels.hip.cpp @@ -4,16 +4,13 @@ #include "core/solver/cb_gmres_kernels.hpp" - #include - #include #include #include #include - #include "accessor/cuda_hip_helper.hpp" #include "accessor/range.hpp" #include "accessor/reduced_row_major.hpp" diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp index 9fac4be8547..ce5cd4192a9 100644 --- a/hip/solver/common_trs_kernels.hip.hpp +++ b/hip/solver/common_trs_kernels.hip.hpp @@ -20,7 +20,6 @@ #include #include - #include "common/cuda_hip/base/pointer_mode_guard.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp index b1ef414c091..c516597bd2b 100644 --- a/hip/solver/idr_kernels.hip.cpp +++ b/hip/solver/idr_kernels.hip.cpp @@ -4,15 +4,12 @@ #include "core/solver/idr_kernels.hpp" - #include #include - #include #include - #include "common/cuda_hip/base/blas_bindings.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/randlib_bindings.hpp" diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp index d355940a487..322c87d37b3 100644 --- a/hip/solver/lower_trs_kernels.hip.cpp +++ b/hip/solver/lower_trs_kernels.hip.cpp @@ -4,7 +4,6 @@ #include "core/solver/lower_trs_kernels.hpp" - #include @@ -19,7 +18,6 @@ #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp index f68105ba6d8..6e19606a78e 100644 --- a/hip/solver/multigrid_kernels.hip.cpp +++ b/hip/solver/multigrid_kernels.hip.cpp @@ -4,13 +4,11 @@ #include "core/solver/multigrid_kernels.hpp" - #include #include #include #include - #include "common/cuda_hip/base/runtime.hpp" #include 
"common/cuda_hip/base/types.hpp" #include "core/base/array_access.hpp" diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp index 2a31e450d27..6be850959cb 100644 --- a/hip/solver/upper_trs_kernels.hip.cpp +++ b/hip/solver/upper_trs_kernels.hip.cpp @@ -4,7 +4,6 @@ #include "core/solver/upper_trs_kernels.hpp" - #include @@ -19,7 +18,6 @@ #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp index 3d24daa5bd5..8f856f0ed8d 100644 --- a/hip/stop/criterion_kernels.hip.cpp +++ b/hip/stop/criterion_kernels.hip.cpp @@ -4,12 +4,10 @@ #include "core/stop/criterion_kernels.hpp" - #include #include #include - #include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp index 7f2b0646ea2..eb6c89a2e2e 100644 --- a/hip/stop/residual_norm_kernels.hip.cpp +++ b/hip/stop/residual_norm_kernels.hip.cpp @@ -4,12 +4,10 @@ #include "core/stop/residual_norm_kernels.hpp" - #include #include #include - #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "core/base/array_access.hpp" diff --git a/hip/test/base/exception_helpers.hip.cpp b/hip/test/base/exception_helpers.hip.cpp index 5f2dd3cd881..85a28fc1c41 100644 --- a/hip/test/base/exception_helpers.hip.cpp +++ b/hip/test/base/exception_helpers.hip.cpp @@ -2,10 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include + +#include #if HIP_VERSION >= 50200000 #include #include diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp index 266532823e7..55d8ffe5863 100644 --- a/hip/test/base/hip_executor.hip.cpp +++ b/hip/test/base/hip_executor.hip.cpp @@ -2,25 +2,20 @@ // // 
SPDX-License-Identifier: BSD-3-Clause - // clang-format off // prevent compilation failure related to disappearing assert(...) statements #include // clang-format on -#include - - #include #include - #include - #include #include +#include #include "common/cuda_hip/base/executor.hpp.inc" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/test/base/hip_executor_topology.hip.cpp b/hip/test/base/hip_executor_topology.hip.cpp index 10ebac1bbc6..50111fd5712 100644 --- a/hip/test/base/hip_executor_topology.hip.cpp +++ b/hip/test/base/hip_executor_topology.hip.cpp @@ -2,20 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause - // clang-format off // prevent compilation failure related to disappearing assert(...) statements #include // clang-format on -#include - - #include #include #include +#include + #if defined(__unix__) || defined(__APPLE__) #include @@ -25,11 +23,9 @@ #include - #include #include - #include "hip/test/utils.hip.hpp" diff --git a/hip/test/base/index_set.cpp b/hip/test/base/index_set.cpp index fdca7ebb905..c34ff5693c2 100644 --- a/hip/test/base/index_set.cpp +++ b/hip/test/base/index_set.cpp @@ -2,20 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include +#include #include - #include "hip/test/utils.hip.hpp" diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp index 4eea4805d87..4ab5bf12602 100644 --- a/hip/test/base/kernel_launch.hip.cpp +++ b/hip/test/base/kernel_launch.hip.cpp @@ -4,20 +4,16 @@ #include "common/unified/base/kernel_launch.hpp" - #include #include - #include - #include #include #include #include - #include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/base/array_access.hpp" diff --git a/hip/test/base/lin_op.cpp b/hip/test/base/lin_op.cpp index dbc0235f67e..939ad3046d0 100644 --- a/hip/test/base/lin_op.cpp +++ b/hip/test/base/lin_op.cpp @@ -2,11 +2,10 @@ // // 
SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include + namespace { diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp index f018c634a6a..1a882989854 100644 --- a/hip/test/base/math.hip.cpp +++ b/hip/test/base/math.hip.cpp @@ -2,30 +2,25 @@ // // SPDX-License-Identifier: BSD-3-Clause - // clang-format off // prevent compilation failure related to disappearing assert(...) statements #include // clang-format on -#include - +#include "hip/base/math.hip.hpp" #include #include #include - #include - #include #include - +#include #include "common/cuda_hip/base/types.hpp" -#include "hip/base/math.hip.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/base/memory.cpp b/hip/test/base/memory.cpp index ece86d640ad..651630fce08 100644 --- a/hip/test/base/memory.cpp +++ b/hip/test/base/memory.cpp @@ -2,20 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include - #include - #include #include #include - +#include #include "hip/test/utils.hip.hpp" diff --git a/hip/test/base/scoped_device_id.hip.cpp b/hip/test/base/scoped_device_id.hip.cpp index 78d51fc989d..07c40214297 100644 --- a/hip/test/base/scoped_device_id.hip.cpp +++ b/hip/test/base/scoped_device_id.hip.cpp @@ -2,20 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause - // clang-format off // prevent compilation failure related to disappearing assert(...) 
statements #include // clang-format on -#include +#include "hip/base/scoped_device_id.hip.hpp" +#include #include - -#include "hip/base/scoped_device_id.hip.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp index f99b4eb8a87..06a104a8879 100644 --- a/hip/test/components/cooperative_groups.hip.cpp +++ b/hip/test/components/cooperative_groups.hip.cpp @@ -2,26 +2,23 @@ // // SPDX-License-Identifier: BSD-3-Clause - // clang-format off // TODO remove when the HIP includes are fixed #include // clang-format on +#include "common/cuda_hip/components/cooperative_groups.hpp" + #include #include - #include - #include #include - #include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index be18447a901..7fc3b9a173a 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -2,7 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause - // clang-format off // TODO remove when the HIP includes are fixed #include @@ -11,20 +10,16 @@ #include "hip/components/merging.hip.hpp" - #include #include #include #include - #include - #include #include - #include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp index 252e8841893..85c54075231 100644 --- a/hip/test/components/searching.hip.cpp +++ b/hip/test/components/searching.hip.cpp @@ -2,7 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause - // clang-format off // TODO remove when the HIP includes are fixed #include @@ -11,19 +10,15 @@ #include "hip/components/searching.hip.hpp" - #include #include #include - #include - #include #include - #include "common/cuda_hip/components/cooperative_groups.hpp" #include 
"hip/test/utils.hip.hpp" diff --git a/hip/test/components/sorting.hip.cpp b/hip/test/components/sorting.hip.cpp index 5cab0048a4b..79de1dc2269 100644 --- a/hip/test/components/sorting.hip.cpp +++ b/hip/test/components/sorting.hip.cpp @@ -4,18 +4,14 @@ #include "hip/components/sorting.hip.hpp" - #include #include - #include - #include #include - #include "hip/test/utils.hip.hpp" diff --git a/hip/test/matrix/fbcsr_kernels.cpp b/hip/test/matrix/fbcsr_kernels.cpp index 27ff7309ea4..0b4b16086ca 100644 --- a/hip/test/matrix/fbcsr_kernels.cpp +++ b/hip/test/matrix/fbcsr_kernels.cpp @@ -2,19 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/matrix/fbcsr_kernels.hpp" #include - #include - #include +#include - -#include "core/matrix/fbcsr_kernels.hpp" #include "core/test/matrix/fbcsr_sample.hpp" #include "core/test/utils.hpp" #include "core/test/utils/fb_matrix_generator.hpp" diff --git a/hip/test/matrix/fft_kernels.hip.cpp b/hip/test/matrix/fft_kernels.hip.cpp index 366e04c3290..d3ec4d1c58a 100644 --- a/hip/test/matrix/fft_kernels.hip.cpp +++ b/hip/test/matrix/fft_kernels.hip.cpp @@ -2,10 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include + +#include #if HIP_VERSION >= 50200000 #include #else @@ -15,7 +14,6 @@ #include - #include #include diff --git a/hip/test/solver/lower_trs_kernels.cpp b/hip/test/solver/lower_trs_kernels.cpp index c2ad9cda357..d249ae3cca3 100644 --- a/hip/test/solver/lower_trs_kernels.cpp +++ b/hip/test/solver/lower_trs_kernels.cpp @@ -2,21 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/solver/lower_trs_kernels.hpp" + #include #include - #include - #include #include #include #include #include - -#include "core/solver/lower_trs_kernels.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/solver/upper_trs_kernels.cpp b/hip/test/solver/upper_trs_kernels.cpp index c161bfcd32f..fbe8259bad6 100644 --- a/hip/test/solver/upper_trs_kernels.cpp +++ 
b/hip/test/solver/upper_trs_kernels.cpp @@ -2,21 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/solver/upper_trs_kernels.hpp" + #include #include - #include - #include #include #include #include #include - -#include "core/solver/upper_trs_kernels.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp index 112ae6c24e1..3f5e1f1f858 100644 --- a/hip/test/utils.hip.hpp +++ b/hip/test/utils.hip.hpp @@ -6,14 +6,11 @@ #define GKO_HIP_TEST_UTILS_HIP_HPP_ -#include "core/test/utils.hpp" - - #include #include - #include "core/test/gtest/resources.hpp" +#include "core/test/utils.hpp" #include "hip/base/device.hpp" diff --git a/hip/test/utils/assertions_test.cpp b/hip/test/utils/assertions_test.cpp index 17363313ab5..582967469ec 100644 --- a/hip/test/utils/assertions_test.cpp +++ b/hip/test/utils/assertions_test.cpp @@ -4,14 +4,11 @@ #include "core/test/utils/assertions.hpp" - #include - #include #include - #include "hip/test/utils.hip.hpp" diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp index 8ba10a648d1..5c799ab58f1 100644 --- a/include/ginkgo/core/base/abstract_factory.hpp +++ b/include/ginkgo/core/base/abstract_factory.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp index 5d88206cf2b..e0cf8c22ab3 100644 --- a/include/ginkgo/core/base/array.hpp +++ b/include/ginkgo/core/base/array.hpp @@ -12,7 +12,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/base/batch_dim.hpp b/include/ginkgo/core/base/batch_dim.hpp index de0699d6876..e34c9a4c2c4 100644 --- a/include/ginkgo/core/base/batch_dim.hpp +++ b/include/ginkgo/core/base/batch_dim.hpp @@ -8,7 +8,6 @@ #include - #include #include diff --git a/include/ginkgo/core/base/batch_lin_op.hpp b/include/ginkgo/core/base/batch_lin_op.hpp index 9de2db6b724..701d25bbcb3 100644 --- 
a/include/ginkgo/core/base/batch_lin_op.hpp +++ b/include/ginkgo/core/base/batch_lin_op.hpp @@ -10,7 +10,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 66ea677fca9..d04e9562fce 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/base/combination.hpp b/include/ginkgo/core/base/combination.hpp index 8992394ea0d..f3cdea82dcb 100644 --- a/include/ginkgo/core/base/combination.hpp +++ b/include/ginkgo/core/base/combination.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/include/ginkgo/core/base/composition.hpp b/include/ginkgo/core/base/composition.hpp index 62b854264a2..e151e121b56 100644 --- a/include/ginkgo/core/base/composition.hpp +++ b/include/ginkgo/core/base/composition.hpp @@ -8,7 +8,6 @@ #include - #include #include diff --git a/include/ginkgo/core/base/dense_cache.hpp b/include/ginkgo/core/base/dense_cache.hpp index a9dd8b57ba2..dd2918ab6a7 100644 --- a/include/ginkgo/core/base/dense_cache.hpp +++ b/include/ginkgo/core/base/dense_cache.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/include/ginkgo/core/base/device.hpp b/include/ginkgo/core/base/device.hpp index 90240b6791a..5b82d79f0b8 100644 --- a/include/ginkgo/core/base/device.hpp +++ b/include/ginkgo/core/base/device.hpp @@ -11,7 +11,6 @@ #include #include - #include diff --git a/include/ginkgo/core/base/dim.hpp b/include/ginkgo/core/base/dim.hpp index 10a4a90fa2d..ffa38aa6a76 100644 --- a/include/ginkgo/core/base/dim.hpp +++ b/include/ginkgo/core/base/dim.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp index 678c714dada..febc5e17034 100644 --- a/include/ginkgo/core/base/exception.hpp +++ b/include/ginkgo/core/base/exception.hpp @@ -9,7 
+9,6 @@ #include #include - #include diff --git a/include/ginkgo/core/base/exception_helpers.hpp b/include/ginkgo/core/base/exception_helpers.hpp index 0482a50a334..f0104ba1a7c 100644 --- a/include/ginkgo/core/base/exception_helpers.hpp +++ b/include/ginkgo/core/base/exception_helpers.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 761405c0b3d..0d592485c1c 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -17,7 +17,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/base/index_set.hpp b/include/ginkgo/core/base/index_set.hpp index 8a0a60972a7..260896d6b2f 100644 --- a/include/ginkgo/core/base/index_set.hpp +++ b/include/ginkgo/core/base/index_set.hpp @@ -11,7 +11,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/base/intrinsics.hpp b/include/ginkgo/core/base/intrinsics.hpp index 941a32458c7..37e7f361781 100644 --- a/include/ginkgo/core/base/intrinsics.hpp +++ b/include/ginkgo/core/base/intrinsics.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index f9f60f9c3c4..26e1c1b9baa 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ b/include/ginkgo/core/base/lin_op.hpp @@ -10,7 +10,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/base/machine_topology.hpp b/include/ginkgo/core/base/machine_topology.hpp index 453281395ef..0a1fff15268 100644 --- a/include/ginkgo/core/base/machine_topology.hpp +++ b/include/ginkgo/core/base/machine_topology.hpp @@ -16,7 +16,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 30b0da475d0..42eff5a5d40 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -13,7 +13,6 @@ #include #include - 
#include #include #include diff --git a/include/ginkgo/core/base/matrix_assembly_data.hpp b/include/ginkgo/core/base/matrix_assembly_data.hpp index 4eeed1fd702..6993f2004f2 100644 --- a/include/ginkgo/core/base/matrix_assembly_data.hpp +++ b/include/ginkgo/core/base/matrix_assembly_data.hpp @@ -12,7 +12,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/base/matrix_data.hpp b/include/ginkgo/core/base/matrix_data.hpp index 954bf678b18..0edb39a9c6d 100644 --- a/include/ginkgo/core/base/matrix_data.hpp +++ b/include/ginkgo/core/base/matrix_data.hpp @@ -11,7 +11,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 0909dce5cea..64c04e1805a 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -10,7 +10,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/base/mtx_io.hpp b/include/ginkgo/core/base/mtx_io.hpp index 14c04244df3..102cb446cc4 100644 --- a/include/ginkgo/core/base/mtx_io.hpp +++ b/include/ginkgo/core/base/mtx_io.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/include/ginkgo/core/base/perturbation.hpp b/include/ginkgo/core/base/perturbation.hpp index 6db017ac5b4..b6f2f95c008 100644 --- a/include/ginkgo/core/base/perturbation.hpp +++ b/include/ginkgo/core/base/perturbation.hpp @@ -8,7 +8,6 @@ #include - #include #include diff --git a/include/ginkgo/core/base/polymorphic_object.hpp b/include/ginkgo/core/base/polymorphic_object.hpp index e7e3d4b154b..3a17cfd27ef 100644 --- a/include/ginkgo/core/base/polymorphic_object.hpp +++ b/include/ginkgo/core/base/polymorphic_object.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index 322ac246385..680bc47bcb6 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -8,7 +8,6 @@ #include - #include 
#include #include diff --git a/include/ginkgo/core/base/range_accessors.hpp b/include/ginkgo/core/base/range_accessors.hpp index 9046d33cf85..56335b8dd97 100644 --- a/include/ginkgo/core/base/range_accessors.hpp +++ b/include/ginkgo/core/base/range_accessors.hpp @@ -8,7 +8,6 @@ #include - #include #include diff --git a/include/ginkgo/core/base/segmented_array.hpp b/include/ginkgo/core/base/segmented_array.hpp index a31273c0f06..49a7e6f9d38 100644 --- a/include/ginkgo/core/base/segmented_array.hpp +++ b/include/ginkgo/core/base/segmented_array.hpp @@ -5,7 +5,6 @@ #pragma once #include - #include #include diff --git a/include/ginkgo/core/base/temporary_clone.hpp b/include/ginkgo/core/base/temporary_clone.hpp index baa348a34c9..2e4cc40dcf7 100644 --- a/include/ginkgo/core/base/temporary_clone.hpp +++ b/include/ginkgo/core/base/temporary_clone.hpp @@ -10,7 +10,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/base/temporary_conversion.hpp b/include/ginkgo/core/base/temporary_conversion.hpp index 6b8528f11ec..d0e14806719 100644 --- a/include/ginkgo/core/base/temporary_conversion.hpp +++ b/include/ginkgo/core/base/temporary_conversion.hpp @@ -10,7 +10,6 @@ #include #include - #include #include diff --git a/include/ginkgo/core/base/timer.hpp b/include/ginkgo/core/base/timer.hpp index 8008cecfb94..6f647330126 100644 --- a/include/ginkgo/core/base/timer.hpp +++ b/include/ginkgo/core/base/timer.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp index 62ffa6be554..faa74974703 100644 --- a/include/ginkgo/core/base/utils_helper.hpp +++ b/include/ginkgo/core/base/utils_helper.hpp @@ -10,7 +10,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/base/version.hpp b/include/ginkgo/core/base/version.hpp index de2f6abe485..9fad9430527 100644 --- a/include/ginkgo/core/base/version.hpp +++ b/include/ginkgo/core/base/version.hpp @@ 
-8,7 +8,6 @@ #include - #include #include diff --git a/include/ginkgo/core/config/config.hpp b/include/ginkgo/core/config/config.hpp index 4bbf58d8584..27c08caa3a3 100644 --- a/include/ginkgo/core/config/config.hpp +++ b/include/ginkgo/core/config/config.hpp @@ -10,7 +10,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/config/registry.hpp b/include/ginkgo/core/config/registry.hpp index 9a6dc23b6ae..1e5073f8c42 100644 --- a/include/ginkgo/core/config/registry.hpp +++ b/include/ginkgo/core/config/registry.hpp @@ -13,7 +13,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/distributed/lin_op.hpp b/include/ginkgo/core/distributed/lin_op.hpp index a84425be465..144c9654eb5 100644 --- a/include/ginkgo/core/distributed/lin_op.hpp +++ b/include/ginkgo/core/distributed/lin_op.hpp @@ -10,7 +10,6 @@ #include #include - #include diff --git a/include/ginkgo/core/distributed/polymorphic_object.hpp b/include/ginkgo/core/distributed/polymorphic_object.hpp index 5cfe55049e6..553dc4d2d19 100644 --- a/include/ginkgo/core/distributed/polymorphic_object.hpp +++ b/include/ginkgo/core/distributed/polymorphic_object.hpp @@ -9,7 +9,6 @@ #include #include - #include diff --git a/include/ginkgo/core/factorization/cholesky.hpp b/include/ginkgo/core/factorization/cholesky.hpp index c5b0c6c2e58..0b3a7fb0caf 100644 --- a/include/ginkgo/core/factorization/cholesky.hpp +++ b/include/ginkgo/core/factorization/cholesky.hpp @@ -4,7 +4,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/factorization/ic.hpp b/include/ginkgo/core/factorization/ic.hpp index 2b2f213341a..616360ce039 100644 --- a/include/ginkgo/core/factorization/ic.hpp +++ b/include/ginkgo/core/factorization/ic.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp index 96efcd1f4ba..80f11ab7b6f 100644 --- 
a/include/ginkgo/core/factorization/ilu.hpp +++ b/include/ginkgo/core/factorization/ilu.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/factorization/lu.hpp b/include/ginkgo/core/factorization/lu.hpp index c77fd48f6bb..d00f5a111b3 100644 --- a/include/ginkgo/core/factorization/lu.hpp +++ b/include/ginkgo/core/factorization/lu.hpp @@ -4,7 +4,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/factorization/par_ic.hpp b/include/ginkgo/core/factorization/par_ic.hpp index 35cf01f3c79..b5f14a997b4 100644 --- a/include/ginkgo/core/factorization/par_ic.hpp +++ b/include/ginkgo/core/factorization/par_ic.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/factorization/par_ict.hpp b/include/ginkgo/core/factorization/par_ict.hpp index 904878e2a0a..bc2e38eadf4 100644 --- a/include/ginkgo/core/factorization/par_ict.hpp +++ b/include/ginkgo/core/factorization/par_ict.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/factorization/par_ilu.hpp b/include/ginkgo/core/factorization/par_ilu.hpp index d147d912749..88d183a939c 100644 --- a/include/ginkgo/core/factorization/par_ilu.hpp +++ b/include/ginkgo/core/factorization/par_ilu.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/factorization/par_ilut.hpp b/include/ginkgo/core/factorization/par_ilut.hpp index 166dc04a973..c73e3a1b905 100644 --- a/include/ginkgo/core/factorization/par_ilut.hpp +++ b/include/ginkgo/core/factorization/par_ilut.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/log/batch_logger.hpp b/include/ginkgo/core/log/batch_logger.hpp index 5043c1a0841..16b3c26aa20 100644 --- a/include/ginkgo/core/log/batch_logger.hpp +++ b/include/ginkgo/core/log/batch_logger.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/log/convergence.hpp 
b/include/ginkgo/core/log/convergence.hpp index 7327f7ff815..767146623a3 100644 --- a/include/ginkgo/core/log/convergence.hpp +++ b/include/ginkgo/core/log/convergence.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index 7f7351addf5..907bc418906 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -12,7 +12,6 @@ #include #include - #include #include diff --git a/include/ginkgo/core/log/papi.hpp b/include/ginkgo/core/log/papi.hpp index 2595a574a05..a395a7b7108 100644 --- a/include/ginkgo/core/log/papi.hpp +++ b/include/ginkgo/core/log/papi.hpp @@ -12,12 +12,12 @@ #if GKO_HAVE_PAPI_SDE -#include #include #include #include #include +#include #include #include diff --git a/include/ginkgo/core/log/performance_hint.hpp b/include/ginkgo/core/log/performance_hint.hpp index 035dc690f7a..1a693ae184b 100644 --- a/include/ginkgo/core/log/performance_hint.hpp +++ b/include/ginkgo/core/log/performance_hint.hpp @@ -10,7 +10,6 @@ #include #include - #include diff --git a/include/ginkgo/core/log/profiler_hook.hpp b/include/ginkgo/core/log/profiler_hook.hpp index 1821d3a8f64..ce5e8831f1c 100644 --- a/include/ginkgo/core/log/profiler_hook.hpp +++ b/include/ginkgo/core/log/profiler_hook.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/log/record.hpp b/include/ginkgo/core/log/record.hpp index afeb1f3b973..41bfe245dc4 100644 --- a/include/ginkgo/core/log/record.hpp +++ b/include/ginkgo/core/log/record.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/log/stream.hpp b/include/ginkgo/core/log/stream.hpp index 120cbd84a59..83ef8b2e607 100644 --- a/include/ginkgo/core/log/stream.hpp +++ b/include/ginkgo/core/log/stream.hpp @@ -9,7 +9,6 @@ #include #include - #include diff --git a/include/ginkgo/core/matrix/batch_csr.hpp 
b/include/ginkgo/core/matrix/batch_csr.hpp index 9e1ea9283e5..e431454063d 100644 --- a/include/ginkgo/core/matrix/batch_csr.hpp +++ b/include/ginkgo/core/matrix/batch_csr.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index fba9dbe1514..5ea7c3ee128 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp index a729f54191b..b760cee795a 100644 --- a/include/ginkgo/core/matrix/batch_ell.hpp +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 46728246329..d7e9b1a10e0 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index 3cd36658f4e..5e995cb0ba0 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/matrix/permutation.hpp b/include/ginkgo/core/matrix/permutation.hpp index b15d14e4b83..5549b75f694 100644 --- a/include/ginkgo/core/matrix/permutation.hpp +++ b/include/ginkgo/core/matrix/permutation.hpp @@ -11,7 +11,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/matrix/row_gatherer.hpp b/include/ginkgo/core/matrix/row_gatherer.hpp index ebc4f175a17..bf55f03bdb0 100644 --- a/include/ginkgo/core/matrix/row_gatherer.hpp +++ b/include/ginkgo/core/matrix/row_gatherer.hpp @@ -11,7 +11,6 @@ #include #include - #include #include #include diff --git 
a/include/ginkgo/core/matrix/scaled_permutation.hpp b/include/ginkgo/core/matrix/scaled_permutation.hpp index 88dff395ab6..8f48bb38f88 100644 --- a/include/ginkgo/core/matrix/scaled_permutation.hpp +++ b/include/ginkgo/core/matrix/scaled_permutation.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/matrix/sparsity_csr.hpp b/include/ginkgo/core/matrix/sparsity_csr.hpp index 8dfe8b06713..0e6aa98f5ae 100644 --- a/include/ginkgo/core/matrix/sparsity_csr.hpp +++ b/include/ginkgo/core/matrix/sparsity_csr.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/multigrid/fixed_coarsening.hpp b/include/ginkgo/core/multigrid/fixed_coarsening.hpp index 2ab3211b609..86c21acba39 100644 --- a/include/ginkgo/core/multigrid/fixed_coarsening.hpp +++ b/include/ginkgo/core/multigrid/fixed_coarsening.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/multigrid/multigrid_level.hpp b/include/ginkgo/core/multigrid/multigrid_level.hpp index e52122b6bed..7c5b7e09684 100644 --- a/include/ginkgo/core/multigrid/multigrid_level.hpp +++ b/include/ginkgo/core/multigrid/multigrid_level.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/multigrid/pgm.hpp b/include/ginkgo/core/multigrid/pgm.hpp index 99b0856e819..d07001be2f1 100644 --- a/include/ginkgo/core/multigrid/pgm.hpp +++ b/include/ginkgo/core/multigrid/pgm.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/preconditioner/ic.hpp b/include/ginkgo/core/preconditioner/ic.hpp index 5cb1ad201e4..f78e00eea09 100644 --- a/include/ginkgo/core/preconditioner/ic.hpp +++ b/include/ginkgo/core/preconditioner/ic.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp index 816f6e600cb..869681fc547 100644 --- 
a/include/ginkgo/core/preconditioner/ilu.hpp +++ b/include/ginkgo/core/preconditioner/ilu.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/preconditioner/isai.hpp b/include/ginkgo/core/preconditioner/isai.hpp index 3080815f1f8..e17bff28bc7 100644 --- a/include/ginkgo/core/preconditioner/isai.hpp +++ b/include/ginkgo/core/preconditioner/isai.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/preconditioner/utils.hpp b/include/ginkgo/core/preconditioner/utils.hpp index 0ef114fcea3..1e3f35c8ada 100644 --- a/include/ginkgo/core/preconditioner/utils.hpp +++ b/include/ginkgo/core/preconditioner/utils.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/include/ginkgo/core/reorder/amd.hpp b/include/ginkgo/core/reorder/amd.hpp index 764a5426922..9dbffaa1c8c 100644 --- a/include/ginkgo/core/reorder/amd.hpp +++ b/include/ginkgo/core/reorder/amd.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/reorder/mc64.hpp b/include/ginkgo/core/reorder/mc64.hpp index afef9639a6e..b2c1fd1a644 100644 --- a/include/ginkgo/core/reorder/mc64.hpp +++ b/include/ginkgo/core/reorder/mc64.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/reorder/nested_dissection.hpp b/include/ginkgo/core/reorder/nested_dissection.hpp index 89563380cb3..735b56cd354 100644 --- a/include/ginkgo/core/reorder/nested_dissection.hpp +++ b/include/ginkgo/core/reorder/nested_dissection.hpp @@ -15,7 +15,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/reorder/rcm.hpp b/include/ginkgo/core/reorder/rcm.hpp index 09f11d90189..589d38e29d1 100644 --- a/include/ginkgo/core/reorder/rcm.hpp +++ b/include/ginkgo/core/reorder/rcm.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/reorder/reordering_base.hpp b/include/ginkgo/core/reorder/reordering_base.hpp index 83a2dd1886b..8dde7a6734f 
100644 --- a/include/ginkgo/core/reorder/reordering_base.hpp +++ b/include/ginkgo/core/reorder/reordering_base.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/solver/batch_bicgstab.hpp b/include/ginkgo/core/solver/batch_bicgstab.hpp index 50015b49c45..bb287b17a53 100644 --- a/include/ginkgo/core/solver/batch_bicgstab.hpp +++ b/include/ginkgo/core/solver/batch_bicgstab.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/solver/batch_cg.hpp b/include/ginkgo/core/solver/batch_cg.hpp index a6cceebdb09..677936aa397 100644 --- a/include/ginkgo/core/solver/batch_cg.hpp +++ b/include/ginkgo/core/solver/batch_cg.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/solver/bicg.hpp b/include/ginkgo/core/solver/bicg.hpp index cf428e8c74f..9f1ef54cc34 100644 --- a/include/ginkgo/core/solver/bicg.hpp +++ b/include/ginkgo/core/solver/bicg.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/solver/bicgstab.hpp b/include/ginkgo/core/solver/bicgstab.hpp index 38382670597..a57a6c27aa4 100644 --- a/include/ginkgo/core/solver/bicgstab.hpp +++ b/include/ginkgo/core/solver/bicgstab.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/solver/cb_gmres.hpp b/include/ginkgo/core/solver/cb_gmres.hpp index 60d5cd32b4d..976712cd673 100644 --- a/include/ginkgo/core/solver/cb_gmres.hpp +++ b/include/ginkgo/core/solver/cb_gmres.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/solver/cg.hpp b/include/ginkgo/core/solver/cg.hpp index 38acccf9597..9d850ecbe6d 100644 --- a/include/ginkgo/core/solver/cg.hpp +++ b/include/ginkgo/core/solver/cg.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/solver/cgs.hpp b/include/ginkgo/core/solver/cgs.hpp index d930de00736..bde23d76910 100644 --- a/include/ginkgo/core/solver/cgs.hpp 
+++ b/include/ginkgo/core/solver/cgs.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/solver/fcg.hpp b/include/ginkgo/core/solver/fcg.hpp index ff030191225..4577dd1b1d4 100644 --- a/include/ginkgo/core/solver/fcg.hpp +++ b/include/ginkgo/core/solver/fcg.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/solver/gcr.hpp b/include/ginkgo/core/solver/gcr.hpp index 7b8e1c85fa8..62ce9c9c93c 100644 --- a/include/ginkgo/core/solver/gcr.hpp +++ b/include/ginkgo/core/solver/gcr.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp index fa2af094b6a..57bbca0b529 100644 --- a/include/ginkgo/core/solver/gmres.hpp +++ b/include/ginkgo/core/solver/gmres.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/solver/idr.hpp b/include/ginkgo/core/solver/idr.hpp index 78f95ba38c5..9f167d9b2eb 100644 --- a/include/ginkgo/core/solver/idr.hpp +++ b/include/ginkgo/core/solver/idr.hpp @@ -10,7 +10,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp index 9ac7acfaa91..91949261a79 100644 --- a/include/ginkgo/core/solver/ir.hpp +++ b/include/ginkgo/core/solver/ir.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/solver/multigrid.hpp b/include/ginkgo/core/solver/multigrid.hpp index 7a4e8b83be4..2d0278b538e 100644 --- a/include/ginkgo/core/solver/multigrid.hpp +++ b/include/ginkgo/core/solver/multigrid.hpp @@ -11,7 +11,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/solver/solver_base.hpp b/include/ginkgo/core/solver/solver_base.hpp index 43a941f6374..159ad2c15a7 100644 --- a/include/ginkgo/core/solver/solver_base.hpp +++ b/include/ginkgo/core/solver/solver_base.hpp @@ -10,7 +10,6 @@ #include #include - #include #include 
#include diff --git a/include/ginkgo/core/solver/triangular.hpp b/include/ginkgo/core/solver/triangular.hpp index 36e6c483c69..2d42e3bb97a 100644 --- a/include/ginkgo/core/solver/triangular.hpp +++ b/include/ginkgo/core/solver/triangular.hpp @@ -9,7 +9,6 @@ #include #include - #include #include #include diff --git a/include/ginkgo/core/solver/workspace.hpp b/include/ginkgo/core/solver/workspace.hpp index e40eccbb039..e169a7caf1c 100644 --- a/include/ginkgo/core/solver/workspace.hpp +++ b/include/ginkgo/core/solver/workspace.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/include/ginkgo/core/stop/combined.hpp b/include/ginkgo/core/stop/combined.hpp index 5f01e499511..62451538431 100644 --- a/include/ginkgo/core/stop/combined.hpp +++ b/include/ginkgo/core/stop/combined.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/include/ginkgo/core/stop/residual_norm.hpp b/include/ginkgo/core/stop/residual_norm.hpp index 273b28a5a35..6ee3c843e6a 100644 --- a/include/ginkgo/core/stop/residual_norm.hpp +++ b/include/ginkgo/core/stop/residual_norm.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/include/ginkgo/core/stop/time.hpp b/include/ginkgo/core/stop/time.hpp index ec734324985..a41b9c49499 100644 --- a/include/ginkgo/core/stop/time.hpp +++ b/include/ginkgo/core/stop/time.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/include/ginkgo/extensions/config/json_config.hpp b/include/ginkgo/extensions/config/json_config.hpp index d21a3623f46..f8c3cfd5860 100644 --- a/include/ginkgo/extensions/config/json_config.hpp +++ b/include/ginkgo/extensions/config/json_config.hpp @@ -10,10 +10,8 @@ #include #include - #include - #include diff --git a/include/ginkgo/extensions/kokkos/spaces.hpp b/include/ginkgo/extensions/kokkos/spaces.hpp index 6875f931152..1eb4fada3d1 100644 --- a/include/ginkgo/extensions/kokkos/spaces.hpp +++ b/include/ginkgo/extensions/kokkos/spaces.hpp @@ -7,7 +7,6 @@ #include - #include #include #include diff --git 
a/include/ginkgo/extensions/kokkos/types.hpp b/include/ginkgo/extensions/kokkos/types.hpp index 88362f317b1..d595461e409 100644 --- a/include/ginkgo/extensions/kokkos/types.hpp +++ b/include/ginkgo/extensions/kokkos/types.hpp @@ -6,11 +6,8 @@ #define GINKGO_EXTENSIONS_KOKKOS_TYPES_HPP #include - - #include - #include #include #include diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp index 9dc8d266924..395bf96cc7a 100644 --- a/omp/base/batch_multi_vector_kernels.cpp +++ b/omp/base/batch_multi_vector_kernels.cpp @@ -4,15 +4,12 @@ #include "core/base/batch_multi_vector_kernels.hpp" - #include - #include #include #include - #include "core/components/prefix_sum_kernels.hpp" #include "reference/base/batch_struct.hpp" diff --git a/omp/base/device_matrix_data_kernels.cpp b/omp/base/device_matrix_data_kernels.cpp index e8330ce589b..bce89e2f409 100644 --- a/omp/base/device_matrix_data_kernels.cpp +++ b/omp/base/device_matrix_data_kernels.cpp @@ -4,13 +4,10 @@ #include "core/base/device_matrix_data_kernels.hpp" - #include - #include - #include "core/base/allocator.hpp" #include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/omp/base/executor.cpp b/omp/base/executor.cpp index 98ef2d528ae..5e846946e5e 100644 --- a/omp/base/executor.cpp +++ b/omp/base/executor.cpp @@ -4,7 +4,6 @@ #include "ginkgo/core/base/executor.hpp" - #include diff --git a/omp/base/index_set_kernels.cpp b/omp/base/index_set_kernels.cpp index fbfa04a93b4..6dca856b96f 100644 --- a/omp/base/index_set_kernels.cpp +++ b/omp/base/index_set_kernels.cpp @@ -4,18 +4,15 @@ #include "core/base/index_set_kernels.hpp" - #include #include #include #include - #include #include #include - #include "core/base/allocator.hpp" diff --git a/omp/base/kernel_launch.hpp b/omp/base/kernel_launch.hpp index a5fcc32bffc..ac7131fea86 100644 --- a/omp/base/kernel_launch.hpp +++ b/omp/base/kernel_launch.hpp @@ -10,7 +10,6 @@ 
#include - #include "core/synthesizer/implementation_selection.hpp" diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index c7ef7a38220..bc489cb78dc 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -10,7 +10,6 @@ #include - #include diff --git a/omp/base/scoped_device_id.cpp b/omp/base/scoped_device_id.cpp index 9f74a43eb50..6e61d1b63bc 100644 --- a/omp/base/scoped_device_id.cpp +++ b/omp/base/scoped_device_id.cpp @@ -5,7 +5,6 @@ #include #include - #include "core/base/noop_scoped_device_id_guard.hpp" diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp index 01094a8a8dc..c3580cd36bb 100644 --- a/omp/components/atomic.hpp +++ b/omp/components/atomic.hpp @@ -8,7 +8,6 @@ #include - #include #include diff --git a/omp/components/csr_spgeam.hpp b/omp/components/csr_spgeam.hpp index 8ff417df442..e4b3b9b6e51 100644 --- a/omp/components/csr_spgeam.hpp +++ b/omp/components/csr_spgeam.hpp @@ -8,10 +8,8 @@ #include - #include - #include "core/base/utils.hpp" diff --git a/omp/components/matrix_operations.hpp b/omp/components/matrix_operations.hpp index 749bb754676..522915bf05b 100644 --- a/omp/components/matrix_operations.hpp +++ b/omp/components/matrix_operations.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/omp/components/prefix_sum_kernels.cpp b/omp/components/prefix_sum_kernels.cpp index 231acd52685..08d184b7616 100644 --- a/omp/components/prefix_sum_kernels.cpp +++ b/omp/components/prefix_sum_kernels.cpp @@ -4,14 +4,11 @@ #include "core/components/prefix_sum_kernels.hpp" - #include #include - #include - #include "core/base/allocator.hpp" diff --git a/omp/components/sort_small.hpp b/omp/components/sort_small.hpp index b862dab8457..12128576a53 100644 --- a/omp/components/sort_small.hpp +++ b/omp/components/sort_small.hpp @@ -8,7 +8,6 @@ #include - #include diff --git a/omp/distributed/index_map_kernels.cpp b/omp/distributed/index_map_kernels.cpp index 
02ae63261a0..b01dab9cb33 100644 --- a/omp/distributed/index_map_kernels.cpp +++ b/omp/distributed/index_map_kernels.cpp @@ -4,13 +4,10 @@ #include "core/distributed/index_map_kernels.hpp" - #include - #include - #include "core/base/allocator.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/base/iterator_factory.hpp" diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp index 9f7b5594fa7..2f36ec4a778 100644 --- a/omp/distributed/matrix_kernels.cpp +++ b/omp/distributed/matrix_kernels.cpp @@ -4,13 +4,10 @@ #include "core/distributed/matrix_kernels.hpp" - #include - #include - #include "core/base/allocator.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/omp/distributed/partition_helpers_kernels.cpp b/omp/distributed/partition_helpers_kernels.cpp index 70d01d18368..ceae3e17679 100644 --- a/omp/distributed/partition_helpers_kernels.cpp +++ b/omp/distributed/partition_helpers_kernels.cpp @@ -4,7 +4,6 @@ #include "core/distributed/partition_helpers_kernels.hpp" - #include "core/base/iterator_factory.hpp" diff --git a/omp/distributed/partition_kernels.cpp b/omp/distributed/partition_kernels.cpp index c1549989384..25b7b0bfce8 100644 --- a/omp/distributed/partition_kernels.cpp +++ b/omp/distributed/partition_kernels.cpp @@ -4,13 +4,10 @@ #include "core/distributed/partition_kernels.hpp" - #include - #include - #include "core/base/allocator.hpp" diff --git a/omp/distributed/vector_kernels.cpp b/omp/distributed/vector_kernels.cpp index e4daf7d5602..1ae60ed108e 100644 --- a/omp/distributed/vector_kernels.cpp +++ b/omp/distributed/vector_kernels.cpp @@ -4,7 +4,6 @@ #include "core/distributed/vector_kernels.hpp" - #include "core/components/prefix_sum_kernels.hpp" #include "reference/distributed/partition_helpers.hpp" diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp index 19d31647b88..8ce5392ebde 100644 --- 
a/omp/factorization/cholesky_kernels.cpp +++ b/omp/factorization/cholesky_kernels.cpp @@ -4,14 +4,11 @@ #include "core/factorization/cholesky_kernels.hpp" - #include #include - #include - #include "core/base/iterator_factory.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" diff --git a/omp/factorization/factorization_kernels.cpp b/omp/factorization/factorization_kernels.cpp index 6d042114e69..f4b41cbdac5 100644 --- a/omp/factorization/factorization_kernels.cpp +++ b/omp/factorization/factorization_kernels.cpp @@ -4,15 +4,12 @@ #include "core/factorization/factorization_kernels.hpp" - #include #include - #include #include - #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" diff --git a/omp/factorization/lu_kernels.cpp b/omp/factorization/lu_kernels.cpp index c942991b13a..53847ff2b6c 100644 --- a/omp/factorization/lu_kernels.cpp +++ b/omp/factorization/lu_kernels.cpp @@ -4,14 +4,11 @@ #include "core/factorization/lu_kernels.hpp" - #include #include - #include - #include "core/base/allocator.hpp" #include "core/matrix/csr_lookup.hpp" diff --git a/omp/factorization/par_ic_kernels.cpp b/omp/factorization/par_ic_kernels.cpp index 48f4047875e..93093783acc 100644 --- a/omp/factorization/par_ic_kernels.cpp +++ b/omp/factorization/par_ic_kernels.cpp @@ -4,12 +4,10 @@ #include "core/factorization/par_ic_kernels.hpp" - #include #include #include - #include "core/base/utils.hpp" diff --git a/omp/factorization/par_ict_kernels.cpp b/omp/factorization/par_ict_kernels.cpp index d997531c304..b5546e1a644 100644 --- a/omp/factorization/par_ict_kernels.cpp +++ b/omp/factorization/par_ict_kernels.cpp @@ -4,19 +4,16 @@ #include "core/factorization/par_ict_kernels.hpp" - #include #include #include #include - #include #include #include #include - #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" diff --git 
a/omp/factorization/par_ilu_kernels.cpp b/omp/factorization/par_ilu_kernels.cpp index 44ab0cf9cc0..da42a631b81 100644 --- a/omp/factorization/par_ilu_kernels.cpp +++ b/omp/factorization/par_ilu_kernels.cpp @@ -4,10 +4,8 @@ #include "core/factorization/par_ilu_kernels.hpp" - #include - #include #include #include diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp index 48d97920a88..a24709e4f1a 100644 --- a/omp/factorization/par_ilut_kernels.cpp +++ b/omp/factorization/par_ilut_kernels.cpp @@ -4,22 +4,18 @@ #include "core/factorization/par_ilut_kernels.hpp" - #include #include #include #include - #include - #include #include #include #include - #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" diff --git a/omp/matrix/batch_csr_kernels.cpp b/omp/matrix/batch_csr_kernels.cpp index e40b06350bb..eacb26c12cb 100644 --- a/omp/matrix/batch_csr_kernels.cpp +++ b/omp/matrix/batch_csr_kernels.cpp @@ -4,14 +4,11 @@ #include "core/matrix/batch_csr_kernels.hpp" - #include - #include #include - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp index a8cf119f02d..836908260a7 100644 --- a/omp/matrix/batch_dense_kernels.cpp +++ b/omp/matrix/batch_dense_kernels.cpp @@ -4,14 +4,11 @@ #include "core/matrix/batch_dense_kernels.hpp" - #include - #include #include - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp index 74cb4e06aa1..4fb5aeea6fa 100644 --- a/omp/matrix/batch_ell_kernels.cpp +++ b/omp/matrix/batch_ell_kernels.cpp @@ -4,14 +4,11 @@ #include "core/matrix/batch_ell_kernels.hpp" - #include - #include #include - #include "core/base/batch_struct.hpp" #include 
"core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp index e0f606b448f..021795d8e9c 100644 --- a/omp/matrix/coo_kernels.cpp +++ b/omp/matrix/coo_kernels.cpp @@ -4,19 +4,15 @@ #include "core/matrix/coo_kernels.hpp" - #include - #include - #include #include #include #include - #include "core/matrix/dense_kernels.hpp" #include "omp/components/atomic.hpp" diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 70df9f07944..09d1465896b 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -4,16 +4,13 @@ #include "core/matrix/csr_kernels.hpp" - #include #include #include #include - #include - #include #include #include @@ -22,7 +19,6 @@ #include #include - #include "core/base/allocator.hpp" #include "core/base/index_set_kernels.hpp" #include "core/base/iterator_factory.hpp" diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index 20e09f2a747..d1c0f2f8949 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -4,13 +4,10 @@ #include "core/matrix/dense_kernels.hpp" - #include - #include - #include #include #include @@ -23,7 +20,6 @@ #include #include - #include "accessor/block_col_major.hpp" #include "accessor/range.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/omp/matrix/diagonal_kernels.cpp b/omp/matrix/diagonal_kernels.cpp index 622c195755b..71363c7bc6e 100644 --- a/omp/matrix/diagonal_kernels.cpp +++ b/omp/matrix/diagonal_kernels.cpp @@ -4,10 +4,8 @@ #include "core/matrix/diagonal_kernels.hpp" - #include - #include #include diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index aa7c7f35bd1..c35a3654b86 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -4,19 +4,15 @@ #include "core/matrix/ell_kernels.hpp" - #include - #include - #include #include #include #include - #include "accessor/reduced_row_major.hpp" #include 
"core/base/mixed_precision_types.hpp" diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index 0008c3c19a6..db60d85db79 100644 --- a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -4,21 +4,17 @@ #include "core/matrix/fbcsr_kernels.hpp" - #include #include #include - #include - #include #include #include #include - #include "accessor/block_col_major.hpp" #include "core/base/allocator.hpp" #include "core/base/block_sizes.hpp" diff --git a/omp/matrix/fft_kernels.cpp b/omp/matrix/fft_kernels.cpp index 1a7ae601fb6..0301b9093ff 100644 --- a/omp/matrix/fft_kernels.cpp +++ b/omp/matrix/fft_kernels.cpp @@ -4,12 +4,10 @@ #include "core/matrix/fft_kernels.hpp" - #include #include #include - #include "core/base/allocator.hpp" diff --git a/omp/matrix/sellp_kernels.cpp b/omp/matrix/sellp_kernels.cpp index a657d5d54a7..7f8b16264ce 100644 --- a/omp/matrix/sellp_kernels.cpp +++ b/omp/matrix/sellp_kernels.cpp @@ -4,13 +4,10 @@ #include "core/matrix/sellp_kernels.hpp" - #include - #include - #include diff --git a/omp/matrix/sparsity_csr_kernels.cpp b/omp/matrix/sparsity_csr_kernels.cpp index 5782e764845..35bb42c70a6 100644 --- a/omp/matrix/sparsity_csr_kernels.cpp +++ b/omp/matrix/sparsity_csr_kernels.cpp @@ -4,20 +4,16 @@ #include "core/matrix/sparsity_csr_kernels.hpp" - #include #include #include - #include - #include #include #include - #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" diff --git a/omp/multigrid/pgm_kernels.cpp b/omp/multigrid/pgm_kernels.cpp index 09279c6db21..9d2aa047cc4 100644 --- a/omp/multigrid/pgm_kernels.cpp +++ b/omp/multigrid/pgm_kernels.cpp @@ -4,18 +4,14 @@ #include "core/multigrid/pgm_kernels.hpp" - #include #include - #include - #include #include - #include "core/base/iterator_factory.hpp" diff --git a/omp/preconditioner/batch_jacobi_kernels.cpp b/omp/preconditioner/batch_jacobi_kernels.cpp index 
15c7f0ab471..9dfe06be32b 100644 --- a/omp/preconditioner/batch_jacobi_kernels.cpp +++ b/omp/preconditioner/batch_jacobi_kernels.cpp @@ -4,7 +4,6 @@ #include "core/preconditioner/batch_jacobi_kernels.hpp" - #include "core/base/batch_struct.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/batch_struct.hpp" diff --git a/omp/preconditioner/isai_kernels.cpp b/omp/preconditioner/isai_kernels.cpp index b3af8d3a926..6f2fe4838d9 100644 --- a/omp/preconditioner/isai_kernels.cpp +++ b/omp/preconditioner/isai_kernels.cpp @@ -4,20 +4,16 @@ #include "core/preconditioner/isai_kernels.hpp" - #include #include - #include - #include #include #include #include - #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp index 686cb4d9777..76224f97a2f 100644 --- a/omp/preconditioner/jacobi_kernels.cpp +++ b/omp/preconditioner/jacobi_kernels.cpp @@ -4,23 +4,19 @@ #include "core/preconditioner/jacobi_kernels.hpp" - #include #include #include #include #include - #include - #include #include #include #include - #include "core/base/allocator.hpp" #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" diff --git a/omp/reorder/rcm_kernels.cpp b/omp/reorder/rcm_kernels.cpp index 44f5b95f034..dd4eb020695 100644 --- a/omp/reorder/rcm_kernels.cpp +++ b/omp/reorder/rcm_kernels.cpp @@ -4,7 +4,6 @@ #include "core/reorder/rcm_kernels.hpp" - #include #include #include @@ -12,10 +11,8 @@ #include #include - #include - #include #include #include @@ -25,7 +22,6 @@ #include #include - #include "core/base/allocator.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "omp/components/omp_mutex.hpp" diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp index 294365c2410..81df9c45e51 100644 --- a/omp/solver/batch_bicgstab_kernels.cpp +++ b/omp/solver/batch_bicgstab_kernels.cpp @@ 
-4,13 +4,10 @@ #include "core/solver/batch_bicgstab_kernels.hpp" - #include - #include - #include "core/solver/batch_dispatch.hpp" diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp index bdfcd50e050..51c794ab597 100644 --- a/omp/solver/batch_cg_kernels.cpp +++ b/omp/solver/batch_cg_kernels.cpp @@ -4,13 +4,10 @@ #include "core/solver/batch_cg_kernels.hpp" - #include - #include - #include "core/solver/batch_dispatch.hpp" diff --git a/omp/solver/cb_gmres_kernels.cpp b/omp/solver/cb_gmres_kernels.cpp index 76cff297bff..a53294b9fbe 100644 --- a/omp/solver/cb_gmres_kernels.cpp +++ b/omp/solver/cb_gmres_kernels.cpp @@ -4,16 +4,13 @@ #include "core/solver/cb_gmres_kernels.hpp" - #include - #include #include #include #include - #include "common/unified/base/kernel_launch_reduction.hpp" #include "core/solver/cb_gmres_accessor.hpp" diff --git a/omp/solver/idr_kernels.cpp b/omp/solver/idr_kernels.cpp index 1d8ce7fec1b..a93002e4833 100644 --- a/omp/solver/idr_kernels.cpp +++ b/omp/solver/idr_kernels.cpp @@ -4,20 +4,16 @@ #include "core/solver/idr_kernels.hpp" - #include #include #include - #include - #include #include #include - #include "common/unified/base/kernel_launch_reduction.hpp" diff --git a/omp/solver/lower_trs_kernels.cpp b/omp/solver/lower_trs_kernels.cpp index ee5b8b4b5a2..6dac6b46078 100644 --- a/omp/solver/lower_trs_kernels.cpp +++ b/omp/solver/lower_trs_kernels.cpp @@ -4,13 +4,10 @@ #include "core/solver/lower_trs_kernels.hpp" - #include - #include - #include #include #include diff --git a/omp/solver/multigrid_kernels.cpp b/omp/solver/multigrid_kernels.cpp index 09ed8e4cba8..12e5bad8577 100644 --- a/omp/solver/multigrid_kernels.cpp +++ b/omp/solver/multigrid_kernels.cpp @@ -4,7 +4,6 @@ #include "core/solver/multigrid_kernels.hpp" - #include #include #include diff --git a/omp/solver/upper_trs_kernels.cpp b/omp/solver/upper_trs_kernels.cpp index 7e6793a45f4..ea05cabeb63 100644 --- a/omp/solver/upper_trs_kernels.cpp +++ 
b/omp/solver/upper_trs_kernels.cpp @@ -4,13 +4,10 @@ #include "core/solver/upper_trs_kernels.hpp" - #include - #include - #include #include #include diff --git a/omp/stop/criterion_kernels.cpp b/omp/stop/criterion_kernels.cpp index c345ad4ef39..65d880515d9 100644 --- a/omp/stop/criterion_kernels.cpp +++ b/omp/stop/criterion_kernels.cpp @@ -4,7 +4,6 @@ #include "core/stop/criterion_kernels.hpp" - #include diff --git a/omp/stop/residual_norm_kernels.cpp b/omp/stop/residual_norm_kernels.cpp index c72124640df..0ec4395a16b 100644 --- a/omp/stop/residual_norm_kernels.cpp +++ b/omp/stop/residual_norm_kernels.cpp @@ -4,10 +4,8 @@ #include "core/stop/residual_norm_kernels.hpp" - #include - #include #include diff --git a/omp/test/base/index_set.cpp b/omp/test/base/index_set.cpp index 52f54774116..98a11bb8720 100644 --- a/omp/test/base/index_set.cpp +++ b/omp/test/base/index_set.cpp @@ -2,18 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include +#include #include #include - #include "core/base/index_set_kernels.hpp" #include "core/test/utils.hpp" diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp index b01ffa4214d..5644d67caee 100644 --- a/omp/test/base/kernel_launch.cpp +++ b/omp/test/base/kernel_launch.cpp @@ -4,20 +4,16 @@ #include "common/unified/base/kernel_launch.hpp" - #include #include - #include - #include #include #include #include - #include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" diff --git a/omp/test/matrix/fbcsr_kernels.cpp b/omp/test/matrix/fbcsr_kernels.cpp index 51c35171ad5..97f1008d53d 100644 --- a/omp/test/matrix/fbcsr_kernels.cpp +++ b/omp/test/matrix/fbcsr_kernels.cpp @@ -2,26 +2,22 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/matrix/fbcsr_kernels.hpp" #include #include #include #include - #include - #include #include #include #include +#include #include - 
-#include "core/matrix/fbcsr_kernels.hpp" #include "core/test/matrix/fbcsr_sample.hpp" #include "core/test/utils.hpp" #include "core/test/utils/fb_matrix_generator.hpp" diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp index 0692f66b465..b0d20a6b826 100644 --- a/reference/base/batch_multi_vector_kernels.cpp +++ b/reference/base/batch_multi_vector_kernels.cpp @@ -4,15 +4,12 @@ #include "core/base/batch_multi_vector_kernels.hpp" - #include - #include #include #include - #include "core/base/batch_struct.hpp" #include "reference/base/batch_struct.hpp" diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp index 20837cbecc5..e64cac3ba88 100644 --- a/reference/base/batch_struct.hpp +++ b/reference/base/batch_struct.hpp @@ -9,7 +9,6 @@ #include #include - #include "core/base/batch_struct.hpp" diff --git a/reference/base/device_matrix_data_kernels.cpp b/reference/base/device_matrix_data_kernels.cpp index 86912ab2d6e..f9a23b35e69 100644 --- a/reference/base/device_matrix_data_kernels.cpp +++ b/reference/base/device_matrix_data_kernels.cpp @@ -4,13 +4,10 @@ #include "core/base/device_matrix_data_kernels.hpp" - #include - #include - #include "core/components/prefix_sum_kernels.hpp" diff --git a/reference/base/index_set_kernels.cpp b/reference/base/index_set_kernels.cpp index edd0671e7d7..6f769472a6f 100644 --- a/reference/base/index_set_kernels.cpp +++ b/reference/base/index_set_kernels.cpp @@ -4,20 +4,17 @@ #include "core/base/index_set_kernels.hpp" - #include #include #include #include #include - #include #include #include #include - #include "core/base/allocator.hpp" diff --git a/reference/base/scoped_device_id.cpp b/reference/base/scoped_device_id.cpp index f0aa33412be..4be131dccea 100644 --- a/reference/base/scoped_device_id.cpp +++ b/reference/base/scoped_device_id.cpp @@ -5,7 +5,6 @@ #include #include - #include "core/base/noop_scoped_device_id_guard.hpp" diff --git 
a/reference/components/convert_ptrs.hpp b/reference/components/convert_ptrs.hpp index c1fa5542e71..4d8d21db4d8 100644 --- a/reference/components/convert_ptrs.hpp +++ b/reference/components/convert_ptrs.hpp @@ -5,7 +5,6 @@ #include #include - #include diff --git a/reference/components/csr_spgeam.hpp b/reference/components/csr_spgeam.hpp index 4fb52aaa8dd..e8f5f314c5e 100644 --- a/reference/components/csr_spgeam.hpp +++ b/reference/components/csr_spgeam.hpp @@ -8,10 +8,8 @@ #include - #include - #include "core/base/utils.hpp" diff --git a/reference/components/fill_array_kernels.cpp b/reference/components/fill_array_kernels.cpp index 3dc865d9b97..1649aa87982 100644 --- a/reference/components/fill_array_kernels.cpp +++ b/reference/components/fill_array_kernels.cpp @@ -4,7 +4,6 @@ #include "core/components/fill_array_kernels.hpp" - #include diff --git a/reference/components/format_conversion_kernels.cpp b/reference/components/format_conversion_kernels.cpp index 2fa201e544a..faac67c8e27 100644 --- a/reference/components/format_conversion_kernels.cpp +++ b/reference/components/format_conversion_kernels.cpp @@ -4,10 +4,8 @@ #include "core/components/format_conversion_kernels.hpp" - #include - #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/reference/components/precision_conversion_kernels.cpp b/reference/components/precision_conversion_kernels.cpp index 3ce42fbe5cb..db12d9316ee 100644 --- a/reference/components/precision_conversion_kernels.cpp +++ b/reference/components/precision_conversion_kernels.cpp @@ -4,7 +4,6 @@ #include "core/components/precision_conversion_kernels.hpp" - #include diff --git a/reference/components/reduce_array_kernels.cpp b/reference/components/reduce_array_kernels.cpp index e2c497f219c..a70ef95a878 100644 --- a/reference/components/reduce_array_kernels.cpp +++ b/reference/components/reduce_array_kernels.cpp @@ -4,7 +4,6 @@ #include "core/components/reduce_array_kernels.hpp" - #include 
diff --git a/reference/distributed/index_map_kernels.cpp b/reference/distributed/index_map_kernels.cpp index 5f13581eee0..322a95c6cdb 100644 --- a/reference/distributed/index_map_kernels.cpp +++ b/reference/distributed/index_map_kernels.cpp @@ -4,10 +4,8 @@ #include "core/distributed/index_map_kernels.hpp" - #include - #include "core/base/allocator.hpp" #include "core/base/segmented_array.hpp" #include "reference/distributed/partition_helpers.hpp" diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp index 9b4ff9231df..95176b34656 100644 --- a/reference/distributed/matrix_kernels.cpp +++ b/reference/distributed/matrix_kernels.cpp @@ -4,7 +4,6 @@ #include "core/distributed/matrix_kernels.hpp" - #include "core/base/allocator.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/base/iterator_factory.hpp" diff --git a/reference/distributed/partition_helpers.hpp b/reference/distributed/partition_helpers.hpp index fda114b43c6..06bd1e11f32 100644 --- a/reference/distributed/partition_helpers.hpp +++ b/reference/distributed/partition_helpers.hpp @@ -8,7 +8,6 @@ #include - #include #include diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index 4cb3d145038..b57daab2eaa 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -4,7 +4,6 @@ #include "core/distributed/partition_helpers_kernels.hpp" - #include "core/base/iterator_factory.hpp" diff --git a/reference/distributed/vector_kernels.cpp b/reference/distributed/vector_kernels.cpp index de6e462cc2e..76a8be06a0f 100644 --- a/reference/distributed/vector_kernels.cpp +++ b/reference/distributed/vector_kernels.cpp @@ -4,7 +4,6 @@ #include "core/distributed/vector_kernels.hpp" - #include "core/components/prefix_sum_kernels.hpp" #include "reference/distributed/partition_helpers.hpp" diff --git 
a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index d24bf0d74fd..2aeee99d45d 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ b/reference/factorization/cholesky_kernels.cpp @@ -4,15 +4,12 @@ #include "core/factorization/cholesky_kernels.hpp" - #include #include #include - #include - #include "core/base/allocator.hpp" #include "core/base/iterator_factory.hpp" #include "core/components/fill_array_kernels.hpp" diff --git a/reference/factorization/factorization_kernels.cpp b/reference/factorization/factorization_kernels.cpp index 482bf75cb1d..085e2f62ecc 100644 --- a/reference/factorization/factorization_kernels.cpp +++ b/reference/factorization/factorization_kernels.cpp @@ -4,15 +4,12 @@ #include "core/factorization/factorization_kernels.hpp" - #include #include - #include #include - #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" diff --git a/reference/factorization/ic_kernels.cpp b/reference/factorization/ic_kernels.cpp index 28e5a00be6f..6f88467262a 100644 --- a/reference/factorization/ic_kernels.cpp +++ b/reference/factorization/ic_kernels.cpp @@ -4,10 +4,8 @@ #include "core/factorization/ic_kernels.hpp" - #include - #include "core/base/allocator.hpp" diff --git a/reference/factorization/ilu_kernels.cpp b/reference/factorization/ilu_kernels.cpp index 263a1b0de38..fdbe8a9e86f 100644 --- a/reference/factorization/ilu_kernels.cpp +++ b/reference/factorization/ilu_kernels.cpp @@ -4,13 +4,10 @@ #include "core/factorization/ilu_kernels.hpp" - #include - #include - #include "core/base/allocator.hpp" diff --git a/reference/factorization/lu_kernels.cpp b/reference/factorization/lu_kernels.cpp index 50398c2f980..d8516cffb49 100644 --- a/reference/factorization/lu_kernels.cpp +++ b/reference/factorization/lu_kernels.cpp @@ -4,14 +4,11 @@ #include "core/factorization/lu_kernels.hpp" - #include #include - #include - #include "core/base/allocator.hpp" #include 
"core/matrix/csr_lookup.hpp" diff --git a/reference/factorization/par_ic_kernels.cpp b/reference/factorization/par_ic_kernels.cpp index 1753bca0814..4da317cf201 100644 --- a/reference/factorization/par_ic_kernels.cpp +++ b/reference/factorization/par_ic_kernels.cpp @@ -4,12 +4,10 @@ #include "core/factorization/par_ic_kernels.hpp" - #include #include #include - #include "core/base/utils.hpp" diff --git a/reference/factorization/par_ict_kernels.cpp b/reference/factorization/par_ict_kernels.cpp index 52e5099c60f..684158d380c 100644 --- a/reference/factorization/par_ict_kernels.cpp +++ b/reference/factorization/par_ict_kernels.cpp @@ -4,18 +4,15 @@ #include "core/factorization/par_ict_kernels.hpp" - #include #include #include #include - #include #include #include - #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" diff --git a/reference/factorization/par_ilu_kernels.cpp b/reference/factorization/par_ilu_kernels.cpp index 2e2694b0f1f..44c2e5f66bc 100644 --- a/reference/factorization/par_ilu_kernels.cpp +++ b/reference/factorization/par_ilu_kernels.cpp @@ -4,10 +4,8 @@ #include "core/factorization/par_ilu_kernels.hpp" - #include - #include #include #include diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp index 293a17e2b83..abef6e9b5f2 100644 --- a/reference/factorization/par_ilut_kernels.cpp +++ b/reference/factorization/par_ilut_kernels.cpp @@ -4,19 +4,16 @@ #include "core/factorization/par_ilut_kernels.hpp" - #include #include #include #include - #include #include #include #include - #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" diff --git a/reference/matrix/batch_csr_kernels.cpp b/reference/matrix/batch_csr_kernels.cpp index 6b5c8829cbd..7c6d9a6c000 100644 --- a/reference/matrix/batch_csr_kernels.cpp +++ b/reference/matrix/batch_csr_kernels.cpp @@ -4,14 +4,11 @@ 
#include "core/matrix/batch_csr_kernels.hpp" - #include - #include #include - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp index 5fba7fce9b2..2116a691fb9 100644 --- a/reference/matrix/batch_dense_kernels.cpp +++ b/reference/matrix/batch_dense_kernels.cpp @@ -4,14 +4,11 @@ #include "core/matrix/batch_dense_kernels.hpp" - #include - #include #include - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp index f1a0d6c4016..0d47f9ea601 100644 --- a/reference/matrix/batch_ell_kernels.cpp +++ b/reference/matrix/batch_ell_kernels.cpp @@ -4,14 +4,11 @@ #include "core/matrix/batch_ell_kernels.hpp" - #include - #include #include - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp index 47d31061be3..90e1e445c9b 100644 --- a/reference/matrix/batch_struct.hpp +++ b/reference/matrix/batch_struct.hpp @@ -6,16 +6,13 @@ #define GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_ -#include "core/matrix/batch_struct.hpp" - - #include #include #include #include - #include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" namespace gko { diff --git a/reference/matrix/coo_kernels.cpp b/reference/matrix/coo_kernels.cpp index 9b8789f6d8c..f9bf9f5f33d 100644 --- a/reference/matrix/coo_kernels.cpp +++ b/reference/matrix/coo_kernels.cpp @@ -4,13 +4,11 @@ #include "core/matrix/coo_kernels.hpp" - #include #include #include #include - #include "core/components/format_conversion_kernels.hpp" #include "core/matrix/dense_kernels.hpp" diff --git a/reference/matrix/csr_kernels.cpp 
b/reference/matrix/csr_kernels.cpp index 711efdc9175..f7e2fab4411 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -4,13 +4,11 @@ #include "core/matrix/csr_kernels.hpp" - #include #include #include #include - #include #include #include @@ -21,7 +19,6 @@ #include #include - #include "core/base/allocator.hpp" #include "core/base/index_set_kernels.hpp" #include "core/base/iterator_factory.hpp" diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index fa88c30bd19..53773a131fe 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -4,10 +4,8 @@ #include "core/matrix/dense_kernels.hpp" - #include - #include #include #include @@ -20,7 +18,6 @@ #include #include - #include "accessor/block_col_major.hpp" #include "accessor/range.hpp" #include "core/base/mixed_precision_types.hpp" diff --git a/reference/matrix/diagonal_kernels.cpp b/reference/matrix/diagonal_kernels.cpp index 6c41fa41170..028b7685c2b 100644 --- a/reference/matrix/diagonal_kernels.cpp +++ b/reference/matrix/diagonal_kernels.cpp @@ -4,7 +4,6 @@ #include "core/matrix/diagonal_kernels.hpp" - #include #include diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index 13b7912669e..1fa37c4e250 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -4,13 +4,11 @@ #include "core/matrix/ell_kernels.hpp" - #include #include #include #include - #include "accessor/reduced_row_major.hpp" #include "core/base/mixed_precision_types.hpp" diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp index 3bae91dad68..9e60e380d9c 100644 --- a/reference/matrix/fbcsr_kernels.cpp +++ b/reference/matrix/fbcsr_kernels.cpp @@ -4,19 +4,16 @@ #include "core/matrix/fbcsr_kernels.hpp" - #include #include #include #include - #include #include #include #include - #include "accessor/block_col_major.hpp" #include "core/base/allocator.hpp" #include 
"core/base/block_sizes.hpp" diff --git a/reference/matrix/fft_kernels.cpp b/reference/matrix/fft_kernels.cpp index c262a0a0b7b..00af068803c 100644 --- a/reference/matrix/fft_kernels.cpp +++ b/reference/matrix/fft_kernels.cpp @@ -4,12 +4,10 @@ #include "core/matrix/fft_kernels.hpp" - #include #include #include - #include "core/base/allocator.hpp" diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp index 70ccf8d5583..f2a06c321f2 100644 --- a/reference/matrix/hybrid_kernels.cpp +++ b/reference/matrix/hybrid_kernels.cpp @@ -4,7 +4,6 @@ #include "core/matrix/hybrid_kernels.hpp" - #include #include #include @@ -12,7 +11,6 @@ #include #include - #include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/ell_kernels.hpp" diff --git a/reference/matrix/scaled_permutation_kernels.cpp b/reference/matrix/scaled_permutation_kernels.cpp index 05c3d28ef49..b00e06f72f2 100644 --- a/reference/matrix/scaled_permutation_kernels.cpp +++ b/reference/matrix/scaled_permutation_kernels.cpp @@ -4,7 +4,6 @@ #include "core/matrix/scaled_permutation_kernels.hpp" - #include diff --git a/reference/matrix/sellp_kernels.cpp b/reference/matrix/sellp_kernels.cpp index 1f2aa604e2a..120194d6952 100644 --- a/reference/matrix/sellp_kernels.cpp +++ b/reference/matrix/sellp_kernels.cpp @@ -4,13 +4,11 @@ #include "core/matrix/sellp_kernels.hpp" - #include #include #include #include - #include "core/components/prefix_sum_kernels.hpp" diff --git a/reference/matrix/sparsity_csr_kernels.cpp b/reference/matrix/sparsity_csr_kernels.cpp index e4a3c6d13f6..c511a16a292 100644 --- a/reference/matrix/sparsity_csr_kernels.cpp +++ b/reference/matrix/sparsity_csr_kernels.cpp @@ -4,17 +4,14 @@ #include "core/matrix/sparsity_csr_kernels.hpp" - #include #include #include - #include #include #include - #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" #include 
"core/components/format_conversion_kernels.hpp" diff --git a/reference/multigrid/pgm_kernels.cpp b/reference/multigrid/pgm_kernels.cpp index ea2d91b84c5..2a6e3252a9f 100644 --- a/reference/multigrid/pgm_kernels.cpp +++ b/reference/multigrid/pgm_kernels.cpp @@ -4,12 +4,10 @@ #include "core/multigrid/pgm_kernels.hpp" - #include #include #include - #include #include #include @@ -17,7 +15,6 @@ #include #include - #include "core/base/allocator.hpp" #include "core/base/iterator_factory.hpp" #include "core/components/prefix_sum_kernels.hpp" diff --git a/reference/preconditioner/batch_block_jacobi.hpp b/reference/preconditioner/batch_block_jacobi.hpp index 009108f1985..0ca4807cd3a 100644 --- a/reference/preconditioner/batch_block_jacobi.hpp +++ b/reference/preconditioner/batch_block_jacobi.hpp @@ -8,7 +8,6 @@ #include - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" diff --git a/reference/preconditioner/batch_jacobi_kernels.cpp b/reference/preconditioner/batch_jacobi_kernels.cpp index d90a1621a65..3c03a21fae7 100644 --- a/reference/preconditioner/batch_jacobi_kernels.cpp +++ b/reference/preconditioner/batch_jacobi_kernels.cpp @@ -4,7 +4,6 @@ #include "core/preconditioner/batch_jacobi_kernels.hpp" - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" diff --git a/reference/preconditioner/isai_kernels.cpp b/reference/preconditioner/isai_kernels.cpp index 2112c3b4e5c..55f56b5705e 100644 --- a/reference/preconditioner/isai_kernels.cpp +++ b/reference/preconditioner/isai_kernels.cpp @@ -4,17 +4,14 @@ #include "core/preconditioner/isai_kernels.hpp" - #include #include - #include #include #include #include - #include "core/matrix/csr_builder.hpp" diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp index 6c37aa4c3aa..4eaf0988a00 100644 --- 
a/reference/preconditioner/jacobi_kernels.cpp +++ b/reference/preconditioner/jacobi_kernels.cpp @@ -4,19 +4,16 @@ #include "core/preconditioner/jacobi_kernels.hpp" - #include #include #include #include - #include #include #include #include - #include "core/base/allocator.hpp" #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" diff --git a/reference/reorder/rcm_kernels.cpp b/reference/reorder/rcm_kernels.cpp index 3c6c9567d36..ff4bcd70214 100644 --- a/reference/reorder/rcm_kernels.cpp +++ b/reference/reorder/rcm_kernels.cpp @@ -4,7 +4,6 @@ #include "core/reorder/rcm_kernels.hpp" - #include #include #include @@ -12,7 +11,6 @@ #include #include - #include #include #include @@ -22,7 +20,6 @@ #include #include - #include "core/base/allocator.hpp" diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp index 00290eb9c81..97de157fb90 100644 --- a/reference/solver/batch_bicgstab_kernels.cpp +++ b/reference/solver/batch_bicgstab_kernels.cpp @@ -4,7 +4,6 @@ #include "core/solver/batch_bicgstab_kernels.hpp" - #include "core/solver/batch_dispatch.hpp" diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp index 408828fce95..290fbc3718b 100644 --- a/reference/solver/batch_cg_kernels.cpp +++ b/reference/solver/batch_cg_kernels.cpp @@ -4,7 +4,6 @@ #include "core/solver/batch_cg_kernels.hpp" - #include "core/solver/batch_dispatch.hpp" diff --git a/reference/solver/bicg_kernels.cpp b/reference/solver/bicg_kernels.cpp index 8fc03dc42d4..dee2d30b8dc 100644 --- a/reference/solver/bicg_kernels.cpp +++ b/reference/solver/bicg_kernels.cpp @@ -4,7 +4,6 @@ #include "core/solver/bicg_kernels.hpp" - #include #include #include diff --git a/reference/solver/bicgstab_kernels.cpp b/reference/solver/bicgstab_kernels.cpp index 85facf0a6fb..31955a59c53 100644 --- a/reference/solver/bicgstab_kernels.cpp +++ b/reference/solver/bicgstab_kernels.cpp @@ -4,10 +4,8 @@ #include 
"core/solver/bicgstab_kernels.hpp" - #include - #include #include #include diff --git a/reference/solver/cb_gmres_kernels.cpp b/reference/solver/cb_gmres_kernels.cpp index 372253c94fb..5d41a0d0e00 100644 --- a/reference/solver/cb_gmres_kernels.cpp +++ b/reference/solver/cb_gmres_kernels.cpp @@ -4,15 +4,12 @@ #include "core/solver/cb_gmres_kernels.hpp" - #include - #include #include #include - #include "core/solver/cb_gmres_accessor.hpp" diff --git a/reference/solver/cg_kernels.cpp b/reference/solver/cg_kernels.cpp index b5ef5b6f050..5af15692414 100644 --- a/reference/solver/cg_kernels.cpp +++ b/reference/solver/cg_kernels.cpp @@ -4,7 +4,6 @@ #include "core/solver/cg_kernels.hpp" - #include #include #include diff --git a/reference/solver/cgs_kernels.cpp b/reference/solver/cgs_kernels.cpp index 2e56702a2ab..a5a5f8c5862 100644 --- a/reference/solver/cgs_kernels.cpp +++ b/reference/solver/cgs_kernels.cpp @@ -4,7 +4,6 @@ #include "core/solver/cgs_kernels.hpp" - #include #include #include diff --git a/reference/solver/common_gmres_kernels.cpp b/reference/solver/common_gmres_kernels.cpp index c1cb3ce6cd4..643c164b828 100644 --- a/reference/solver/common_gmres_kernels.cpp +++ b/reference/solver/common_gmres_kernels.cpp @@ -4,7 +4,6 @@ #include "core/solver/common_gmres_kernels.hpp" - #include #include #include @@ -12,7 +11,6 @@ #include #include - #include "core/solver/cb_gmres_kernels.hpp" diff --git a/reference/solver/fcg_kernels.cpp b/reference/solver/fcg_kernels.cpp index dae9a45cbf9..65b6bf27698 100644 --- a/reference/solver/fcg_kernels.cpp +++ b/reference/solver/fcg_kernels.cpp @@ -4,7 +4,6 @@ #include "core/solver/fcg_kernels.hpp" - #include #include #include diff --git a/reference/solver/gcr_kernels.cpp b/reference/solver/gcr_kernels.cpp index 822ca3874ac..531814c641e 100644 --- a/reference/solver/gcr_kernels.cpp +++ b/reference/solver/gcr_kernels.cpp @@ -4,7 +4,6 @@ #include "core/solver/gcr_kernels.hpp" - #include #include #include diff --git 
a/reference/solver/gmres_kernels.cpp b/reference/solver/gmres_kernels.cpp index a0bc15499cf..a0b22862998 100644 --- a/reference/solver/gmres_kernels.cpp +++ b/reference/solver/gmres_kernels.cpp @@ -4,7 +4,6 @@ #include "core/solver/gmres_kernels.hpp" - #include #include #include diff --git a/reference/solver/idr_kernels.cpp b/reference/solver/idr_kernels.cpp index c1a6eeebcb9..606def8a18b 100644 --- a/reference/solver/idr_kernels.cpp +++ b/reference/solver/idr_kernels.cpp @@ -4,12 +4,10 @@ #include "core/solver/idr_kernels.hpp" - #include #include #include - #include #include #include diff --git a/reference/solver/lower_trs_kernels.cpp b/reference/solver/lower_trs_kernels.cpp index 3a655656d29..ba02c9c838c 100644 --- a/reference/solver/lower_trs_kernels.cpp +++ b/reference/solver/lower_trs_kernels.cpp @@ -4,10 +4,8 @@ #include "core/solver/lower_trs_kernels.hpp" - #include - #include #include #include diff --git a/reference/solver/multigrid_kernels.cpp b/reference/solver/multigrid_kernels.cpp index ff3f7d97a20..b08c9857d3a 100644 --- a/reference/solver/multigrid_kernels.cpp +++ b/reference/solver/multigrid_kernels.cpp @@ -4,7 +4,6 @@ #include "core/solver/multigrid_kernels.hpp" - #include #include #include diff --git a/reference/solver/upper_trs_kernels.cpp b/reference/solver/upper_trs_kernels.cpp index c85ef2b172f..f0c23a9c4cc 100644 --- a/reference/solver/upper_trs_kernels.cpp +++ b/reference/solver/upper_trs_kernels.cpp @@ -4,10 +4,8 @@ #include "core/solver/upper_trs_kernels.hpp" - #include - #include #include #include diff --git a/reference/stop/criterion_kernels.cpp b/reference/stop/criterion_kernels.cpp index 1e77df0b63d..4a91429d784 100644 --- a/reference/stop/criterion_kernels.cpp +++ b/reference/stop/criterion_kernels.cpp @@ -4,7 +4,6 @@ #include "core/stop/criterion_kernels.hpp" - #include diff --git a/reference/stop/residual_norm_kernels.cpp b/reference/stop/residual_norm_kernels.cpp index d7e6783eace..ba2672edc28 100644 --- 
a/reference/stop/residual_norm_kernels.cpp +++ b/reference/stop/residual_norm_kernels.cpp @@ -4,10 +4,8 @@ #include "core/stop/residual_norm_kernels.hpp" - #include - #include #include #include diff --git a/reference/test/base/array.cpp b/reference/test/base/array.cpp index 4d2c3ea909b..be0396383e1 100644 --- a/reference/test/base/array.cpp +++ b/reference/test/base/array.cpp @@ -2,18 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - +#include #include - #include "core/test/utils.hpp" diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index 85c66b8ae44..e673046a490 100644 --- a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -2,24 +2,20 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/base/batch_multi_vector_kernels.hpp" #include #include #include - #include - +#include #include #include #include #include - -#include "core/base/batch_multi_vector_kernels.hpp" #include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" #include "core/test/utils/batch_helpers.hpp" diff --git a/reference/test/base/combination.cpp b/reference/test/base/combination.cpp index 8f4d2362724..aea578f4e7e 100644 --- a/reference/test/base/combination.cpp +++ b/reference/test/base/combination.cpp @@ -2,18 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - +#include #include - #include "core/test/utils.hpp" diff --git a/reference/test/base/composition.cpp b/reference/test/base/composition.cpp index ed46c58aa73..f736edb53f9 100644 --- a/reference/test/base/composition.cpp +++ b/reference/test/base/composition.cpp @@ -2,18 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - +#include #include - #include "core/test/utils.hpp" diff --git a/reference/test/base/index_set.cpp b/reference/test/base/index_set.cpp index 
71fdaeb4f13..0d35ecac495 100644 --- a/reference/test/base/index_set.cpp +++ b/reference/test/base/index_set.cpp @@ -2,20 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include +#include #include - #include "core/test/utils.hpp" diff --git a/reference/test/base/perturbation.cpp b/reference/test/base/perturbation.cpp index ad9774257ad..b6be9ab1563 100644 --- a/reference/test/base/perturbation.cpp +++ b/reference/test/base/perturbation.cpp @@ -2,18 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - +#include #include - #include "core/test/utils.hpp" diff --git a/reference/test/base/utils.cpp b/reference/test/base/utils.cpp index b3e4f9f9612..27a3a31c1e0 100644 --- a/reference/test/base/utils.cpp +++ b/reference/test/base/utils.cpp @@ -4,13 +4,10 @@ #include "core/base/utils.hpp" - #include - #include - #include #include #include @@ -19,7 +16,6 @@ #include #include - #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" diff --git a/reference/test/components/absolute_array_kernels.cpp b/reference/test/components/absolute_array_kernels.cpp index b16dfdd9989..c192d540032 100644 --- a/reference/test/components/absolute_array_kernels.cpp +++ b/reference/test/components/absolute_array_kernels.cpp @@ -4,19 +4,15 @@ #include "core/components/absolute_array_kernels.hpp" - #include #include #include - #include - #include #include - #include "core/test/utils.hpp" diff --git a/reference/test/components/fill_array_kernels.cpp b/reference/test/components/fill_array_kernels.cpp index 2f534be94b8..d087c833c96 100644 --- a/reference/test/components/fill_array_kernels.cpp +++ b/reference/test/components/fill_array_kernels.cpp @@ -4,18 +4,14 @@ #include "core/components/fill_array_kernels.hpp" - #include #include #include - #include - #include - #include "core/test/utils.hpp" diff --git a/reference/test/components/format_conversion_kernels.cpp 
b/reference/test/components/format_conversion_kernels.cpp index f9f9ef828b5..e5d2c2d9692 100644 --- a/reference/test/components/format_conversion_kernels.cpp +++ b/reference/test/components/format_conversion_kernels.cpp @@ -4,17 +4,14 @@ #include "core/components/format_conversion_kernels.hpp" - #include #include #include #include #include - #include - #include "core/test/utils.hpp" diff --git a/reference/test/components/precision_conversion_kernels.cpp b/reference/test/components/precision_conversion_kernels.cpp index e251101e1e7..129758e0b95 100644 --- a/reference/test/components/precision_conversion_kernels.cpp +++ b/reference/test/components/precision_conversion_kernels.cpp @@ -8,13 +8,10 @@ #include #include - #include - #include - #include "core/test/utils.hpp" diff --git a/reference/test/components/prefix_sum_kernels.cpp b/reference/test/components/prefix_sum_kernels.cpp index c8820d5031c..00265442cce 100644 --- a/reference/test/components/prefix_sum_kernels.cpp +++ b/reference/test/components/prefix_sum_kernels.cpp @@ -4,20 +4,16 @@ #include "core/components/prefix_sum_kernels.hpp" - #include #include #include #include #include - #include - #include - #include "core/test/utils.hpp" diff --git a/reference/test/components/reduce_array_kernels.cpp b/reference/test/components/reduce_array_kernels.cpp index 2599f83178f..b88ec181261 100644 --- a/reference/test/components/reduce_array_kernels.cpp +++ b/reference/test/components/reduce_array_kernels.cpp @@ -4,16 +4,12 @@ #include "core/components/reduce_array_kernels.hpp" - #include - #include - #include - #include "core/test/utils.hpp" diff --git a/reference/test/distributed/index_map_kernels.cpp b/reference/test/distributed/index_map_kernels.cpp index 972db10654f..72b0a0e523b 100644 --- a/reference/test/distributed/index_map_kernels.cpp +++ b/reference/test/distributed/index_map_kernels.cpp @@ -2,22 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/distributed/index_map_kernels.hpp" 
#include #include #include - #include #include - #include +#include - -#include "core/distributed/index_map_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp index 5d96f4f9c64..a34844cbde9 100644 --- a/reference/test/distributed/matrix_kernels.cpp +++ b/reference/test/distributed/matrix_kernels.cpp @@ -2,22 +2,20 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/distributed/matrix_kernels.hpp" + #include #include #include - #include #include - #include #include #include #include - -#include "core/distributed/matrix_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/distributed/partition_helpers_kernels.cpp b/reference/test/distributed/partition_helpers_kernels.cpp index 08c17615dd6..18b95cb9dad 100644 --- a/reference/test/distributed/partition_helpers_kernels.cpp +++ b/reference/test/distributed/partition_helpers_kernels.cpp @@ -2,20 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/distributed/partition_helpers_kernels.hpp" + #include #include #include - #include #include - #include #include - -#include "core/distributed/partition_helpers_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/distributed/partition_kernels.cpp b/reference/test/distributed/partition_kernels.cpp index 426195eef25..e06f3cc4029 100644 --- a/reference/test/distributed/partition_kernels.cpp +++ b/reference/test/distributed/partition_kernels.cpp @@ -2,22 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/distributed/partition_kernels.hpp" #include #include #include - #include #include - #include +#include - -#include "core/distributed/partition_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/distributed/vector_kernels.cpp b/reference/test/distributed/vector_kernels.cpp index 0afe9787a48..7de3104b7fb 100644 --- a/reference/test/distributed/vector_kernels.cpp +++ 
b/reference/test/distributed/vector_kernels.cpp @@ -2,20 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/distributed/vector_kernels.hpp" + #include #include #include - #include #include - #include #include - -#include "core/distributed/vector_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index 87433681d89..d63e491e26a 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -2,24 +2,20 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/factorization/cholesky_kernels.hpp" #include #include #include - #include - #include +#include #include #include - #include "core/components/prefix_sum_kernels.hpp" -#include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/reference/test/factorization/factorization.cpp b/reference/test/factorization/factorization.cpp index 7a9eab5c8a7..2ded81d4867 100644 --- a/reference/test/factorization/factorization.cpp +++ b/reference/test/factorization/factorization.cpp @@ -2,25 +2,20 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include - #include - #include #include #include +#include #include #include #include - #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" diff --git a/reference/test/factorization/ic_kernels.cpp b/reference/test/factorization/ic_kernels.cpp index 22deb9fa2b2..cdcb6b12bc8 100644 --- a/reference/test/factorization/ic_kernels.cpp +++ b/reference/test/factorization/ic_kernels.cpp @@ -2,23 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include - #include - #include +#include #include #include #include - #include "core/test/utils.hpp" diff --git 
a/reference/test/factorization/ilu_kernels.cpp b/reference/test/factorization/ilu_kernels.cpp index 23098724e75..c750ca93fc8 100644 --- a/reference/test/factorization/ilu_kernels.cpp +++ b/reference/test/factorization/ilu_kernels.cpp @@ -2,24 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include - #include - #include +#include #include #include #include - #include "core/test/utils.hpp" diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp index 1b5baf54e15..f4a8b240b38 100644 --- a/reference/test/factorization/lu_kernels.cpp +++ b/reference/test/factorization/lu_kernels.cpp @@ -2,28 +2,24 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/factorization/lu_kernels.hpp" #include #include #include - #include - #include #include #include +#include #include #include - #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" -#include "core/factorization/lu_kernels.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/matrix/csr_lookup.hpp" diff --git a/reference/test/factorization/par_ic_kernels.cpp b/reference/test/factorization/par_ic_kernels.cpp index f044d03194c..b9caf8c9e5e 100644 --- a/reference/test/factorization/par_ic_kernels.cpp +++ b/reference/test/factorization/par_ic_kernels.cpp @@ -2,25 +2,21 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/factorization/par_ic_kernels.hpp" #include #include #include - #include - #include +#include #include #include #include - #include "core/factorization/factorization_kernels.hpp" -#include "core/factorization/par_ic_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/factorization/par_ict_kernels.cpp b/reference/test/factorization/par_ict_kernels.cpp index 6506ed59b0e..55ac5771732 100644 --- 
a/reference/test/factorization/par_ict_kernels.cpp +++ b/reference/test/factorization/par_ict_kernels.cpp @@ -2,25 +2,21 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/factorization/par_ict_kernels.hpp" #include #include #include - #include - #include +#include #include #include #include - #include "core/factorization/factorization_kernels.hpp" -#include "core/factorization/par_ict_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/factorization/par_ilu_kernels.cpp b/reference/test/factorization/par_ilu_kernels.cpp index 25b172a25d6..bf4e422f640 100644 --- a/reference/test/factorization/par_ilu_kernels.cpp +++ b/reference/test/factorization/par_ilu_kernels.cpp @@ -2,26 +2,22 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/factorization/par_ilu_kernels.hpp" #include #include #include #include - #include - #include +#include #include #include #include - #include "core/factorization/factorization_kernels.hpp" -#include "core/factorization/par_ilu_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/factorization/par_ilut_kernels.cpp b/reference/test/factorization/par_ilut_kernels.cpp index 36b68fe2815..59805f246f8 100644 --- a/reference/test/factorization/par_ilut_kernels.cpp +++ b/reference/test/factorization/par_ilut_kernels.cpp @@ -2,24 +2,20 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/factorization/par_ilut_kernels.hpp" #include #include #include - #include - #include +#include #include #include #include - -#include "core/factorization/par_ilut_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/log/convergence.cpp b/reference/test/log/convergence.cpp index 986adb52e89..50db0db49c4 100644 --- a/reference/test/log/convergence.cpp +++ b/reference/test/log/convergence.cpp @@ -2,18 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include +#include #include #include - #include 
"core/test/utils.hpp" diff --git a/reference/test/log/papi.cpp b/reference/test/log/papi.cpp index 54e35f1218e..4f1d9e469f1 100644 --- a/reference/test/log/papi.cpp +++ b/reference/test/log/papi.cpp @@ -2,18 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include - #include +#include #include #include - #include "core/test/utils.hpp" diff --git a/reference/test/matrix/batch_csr_kernels.cpp b/reference/test/matrix/batch_csr_kernels.cpp index 76ff47be730..920bb67696b 100644 --- a/reference/test/matrix/batch_csr_kernels.cpp +++ b/reference/test/matrix/batch_csr_kernels.cpp @@ -2,26 +2,22 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/matrix/batch_csr_kernels.hpp" #include #include #include - #include - #include #include #include #include +#include #include #include - -#include "core/matrix/batch_csr_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/matrix/batch_dense_kernels.cpp b/reference/test/matrix/batch_dense_kernels.cpp index 1eabb3cb749..50c1909959f 100644 --- a/reference/test/matrix/batch_dense_kernels.cpp +++ b/reference/test/matrix/batch_dense_kernels.cpp @@ -2,25 +2,21 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/matrix/batch_dense_kernels.hpp" #include #include #include - #include - #include #include #include #include +#include #include - -#include "core/matrix/batch_dense_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/matrix/batch_ell_kernels.cpp b/reference/test/matrix/batch_ell_kernels.cpp index 44eda90cca9..a2c9ef4e83c 100644 --- a/reference/test/matrix/batch_ell_kernels.cpp +++ b/reference/test/matrix/batch_ell_kernels.cpp @@ -2,26 +2,22 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/matrix/batch_ell_kernels.hpp" #include #include #include - #include - #include #include #include #include +#include #include #include - -#include "core/matrix/batch_ell_kernels.hpp" #include "core/test/utils.hpp" 
diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp index 19752d27a1b..42b68d1cb4c 100644 --- a/reference/test/matrix/coo_kernels.cpp +++ b/reference/test/matrix/coo_kernels.cpp @@ -2,24 +2,20 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/matrix/coo_kernels.hpp" #include #include - #include - #include #include +#include #include #include #include - -#include "core/matrix/coo_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index a206c8c40c2..2d4c61786ad 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -2,20 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/matrix/csr_kernels.hpp" #include - #include - #include #include #include #include #include +#include #include #include #include @@ -26,8 +24,6 @@ #include #include - -#include "core/matrix/csr_kernels.hpp" #include "core/matrix/csr_lookup.hpp" #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index c0bd7fd363b..41294c89d49 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -2,24 +2,22 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/matrix/dense_kernels.hpp" #include #include #include #include - #include - #include #include #include #include #include #include +#include #include #include #include @@ -28,8 +26,6 @@ #include #include - -#include "core/matrix/dense_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp index 6baa11a8e60..208c9d98639 100644 --- a/reference/test/matrix/diagonal_kernels.cpp +++ b/reference/test/matrix/diagonal_kernels.cpp @@ -2,25 +2,21 @@ // // SPDX-License-Identifier: 
BSD-3-Clause -#include - +#include "core/matrix/diagonal_kernels.hpp" #include #include #include #include - #include - #include #include #include #include +#include - -#include "core/matrix/diagonal_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp index 30128021c09..c96dcae773a 100644 --- a/reference/test/matrix/ell_kernels.cpp +++ b/reference/test/matrix/ell_kernels.cpp @@ -2,22 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include #include #include #include #include - +#include #include "core/test/utils.hpp" diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp index 576193ba50e..cd82bade8b7 100644 --- a/reference/test/matrix/fbcsr_kernels.cpp +++ b/reference/test/matrix/fbcsr_kernels.cpp @@ -2,28 +2,24 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/matrix/fbcsr_kernels.hpp" #include #include - #include - #include #include #include #include #include #include +#include #include #include - #include "core/matrix/csr_kernels.hpp" -#include "core/matrix/fbcsr_kernels.hpp" #include "core/test/matrix/fbcsr_sample.hpp" #include "core/test/utils.hpp" #include "core/test/utils/value_generator.hpp" diff --git a/reference/test/matrix/fft_kernels.cpp b/reference/test/matrix/fft_kernels.cpp index b157b992e49..12c2521b71c 100644 --- a/reference/test/matrix/fft_kernels.cpp +++ b/reference/test/matrix/fft_kernels.cpp @@ -2,21 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include - #include - #include #include #include - +#include #include "core/test/utils.hpp" #include "matrices/config.hpp" diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp index 32ee8853163..014b5bb1024 100644 --- a/reference/test/matrix/hybrid_kernels.cpp +++ b/reference/test/matrix/hybrid_kernels.cpp @@ -2,24 +2,20 @@ // 
// SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/matrix/hybrid_kernels.hpp" #include - #include - #include #include #include #include #include #include +#include - -#include "core/matrix/hybrid_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/matrix/identity.cpp b/reference/test/matrix/identity.cpp index 345efb99261..11953de338a 100644 --- a/reference/test/matrix/identity.cpp +++ b/reference/test/matrix/identity.cpp @@ -2,14 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - +#include #include "core/test/utils.hpp" diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp index bb80dcc4736..5418f97353b 100644 --- a/reference/test/matrix/permutation.cpp +++ b/reference/test/matrix/permutation.cpp @@ -2,20 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include #include #include - +#include #include "core/test/utils.hpp" diff --git a/reference/test/matrix/scaled_permutation.cpp b/reference/test/matrix/scaled_permutation.cpp index 87924687148..ba65705bf29 100644 --- a/reference/test/matrix/scaled_permutation.cpp +++ b/reference/test/matrix/scaled_permutation.cpp @@ -2,19 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include #include - +#include #include "core/test/utils.hpp" diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp index 87ef9d19b90..18cf793c7f3 100644 --- a/reference/test/matrix/sellp_kernels.cpp +++ b/reference/test/matrix/sellp_kernels.cpp @@ -2,21 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/matrix/sellp_kernels.hpp" #include - #include #include #include #include #include #include +#include - -#include "core/matrix/sellp_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/matrix/sparsity_csr.cpp b/reference/test/matrix/sparsity_csr.cpp index 
1dead5e362a..d8ed6147e30 100644 --- a/reference/test/matrix/sparsity_csr.cpp +++ b/reference/test/matrix/sparsity_csr.cpp @@ -2,18 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include #include #include #include - +#include #include "core/test/utils.hpp" diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp index 3616ae0a91e..f08d6c352ca 100644 --- a/reference/test/matrix/sparsity_csr_kernels.cpp +++ b/reference/test/matrix/sparsity_csr_kernels.cpp @@ -2,22 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/matrix/sparsity_csr_kernels.hpp" #include - #include - #include #include #include #include +#include - -#include "core/matrix/sparsity_csr_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/multigrid/fixed_coarsening_kernels.cpp b/reference/test/multigrid/fixed_coarsening_kernels.cpp index c6b577523da..b79b1b578dd 100644 --- a/reference/test/multigrid/fixed_coarsening_kernels.cpp +++ b/reference/test/multigrid/fixed_coarsening_kernels.cpp @@ -2,15 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include #include @@ -20,12 +15,12 @@ #include #include #include +#include #include #include #include #include - #include "core/test/utils.hpp" diff --git a/reference/test/multigrid/pgm_kernels.cpp b/reference/test/multigrid/pgm_kernels.cpp index 6f80f27e040..2fc754f23b3 100644 --- a/reference/test/multigrid/pgm_kernels.cpp +++ b/reference/test/multigrid/pgm_kernels.cpp @@ -2,15 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/multigrid/pgm_kernels.hpp" #include - #include - #include #include #include @@ -19,13 +16,12 @@ #include #include #include +#include #include #include #include #include - -#include "core/multigrid/pgm_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/preconditioner/batch_jacobi_kernels.cpp 
b/reference/test/preconditioner/batch_jacobi_kernels.cpp index 520e6c11f31..afc59c0f783 100644 --- a/reference/test/preconditioner/batch_jacobi_kernels.cpp +++ b/reference/test/preconditioner/batch_jacobi_kernels.cpp @@ -2,23 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/preconditioner/batch_jacobi_kernels.hpp" #include - #include - #include #include #include #include +#include #include - -#include "core/preconditioner/batch_jacobi_kernels.hpp" #include "core/test/utils.hpp" #include "core/test/utils/batch_helpers.hpp" diff --git a/reference/test/preconditioner/ic.cpp b/reference/test/preconditioner/ic.cpp index 1e1bc18bda1..16ffc8d7b3c 100644 --- a/reference/test/preconditioner/ic.cpp +++ b/reference/test/preconditioner/ic.cpp @@ -2,24 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include #include #include #include #include +#include #include - #include "core/test/utils.hpp" diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp index 7b8bd657955..180b92be9ec 100644 --- a/reference/test/preconditioner/ilu.cpp +++ b/reference/test/preconditioner/ilu.cpp @@ -2,27 +2,22 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include #include #include #include +#include #include #include #include #include #include - #include "core/test/utils.hpp" diff --git a/reference/test/preconditioner/isai_kernels.cpp b/reference/test/preconditioner/isai_kernels.cpp index 007f0e428c9..e989125c61d 100644 --- a/reference/test/preconditioner/isai_kernels.cpp +++ b/reference/test/preconditioner/isai_kernels.cpp @@ -2,29 +2,25 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/preconditioner/isai_kernels.hpp" #include #include #include #include - #include - #include #include #include #include #include +#include #include #include - #include "core/base/utils.hpp" -#include 
"core/preconditioner/isai_kernels.hpp" #include "core/test/utils.hpp" #include "matrices/config.hpp" diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp index 959a2a6e3b6..801250a9826 100644 --- a/reference/test/preconditioner/jacobi.cpp +++ b/reference/test/preconditioner/jacobi.cpp @@ -2,18 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include - +#include #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp index 8e88310dfb9..97d9951be7a 100644 --- a/reference/test/preconditioner/jacobi_kernels.cpp +++ b/reference/test/preconditioner/jacobi_kernels.cpp @@ -2,19 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include - #include - #include #include - +#include #include "core/base/extended_float.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/reorder/mc64.cpp b/reference/test/reorder/mc64.cpp index 64ddb667c14..2c64538e9b2 100644 --- a/reference/test/reorder/mc64.cpp +++ b/reference/test/reorder/mc64.cpp @@ -2,17 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include - #include - #include #include #include @@ -20,7 +15,7 @@ #include #include #include - +#include #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" diff --git a/reference/test/reorder/mc64_kernels.cpp b/reference/test/reorder/mc64_kernels.cpp index 85352854934..15f90839e1b 100644 --- a/reference/test/reorder/mc64_kernels.cpp +++ b/reference/test/reorder/mc64_kernels.cpp @@ -2,8 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/reorder/mc64.hpp" #include #include @@ -11,17 +10,14 @@ #include #include - #include - #include #include #include - +#include #include "core/components/addressable_pq.hpp" -#include "core/reorder/mc64.hpp" 
#include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" #include "matrices/config.hpp" diff --git a/reference/test/reorder/nested_dissection.cpp b/reference/test/reorder/nested_dissection.cpp index c5054a19f6f..64a900f7742 100644 --- a/reference/test/reorder/nested_dissection.cpp +++ b/reference/test/reorder/nested_dissection.cpp @@ -2,13 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include + +#include #include GKO_METIS_HEADER @@ -18,7 +16,6 @@ #include #include - #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" diff --git a/reference/test/reorder/rcm.cpp b/reference/test/reorder/rcm.cpp index e1a330c046f..f8a18e5b6ec 100644 --- a/reference/test/reorder/rcm.cpp +++ b/reference/test/reorder/rcm.cpp @@ -2,22 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include - #include - #include #include #include #include - +#include #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" diff --git a/reference/test/reorder/rcm_kernels.cpp b/reference/test/reorder/rcm_kernels.cpp index 0f36839ba33..f9d44f2dfd6 100644 --- a/reference/test/reorder/rcm_kernels.cpp +++ b/reference/test/reorder/rcm_kernels.cpp @@ -2,22 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include - #include - #include #include #include #include - +#include #include "core/test/utils/assertions.hpp" diff --git a/reference/test/reorder/scaled_reordered.cpp b/reference/test/reorder/scaled_reordered.cpp index d511ed4a3f4..75ab3728a30 100644 --- a/reference/test/reorder/scaled_reordered.cpp +++ b/reference/test/reorder/scaled_reordered.cpp @@ -2,27 +2,22 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include #include #include #include #include +#include #include #include #include #include - #include "core/test/utils.hpp" diff --git a/reference/test/solver/batch_bicgstab_kernels.cpp 
b/reference/test/solver/batch_bicgstab_kernels.cpp index 2051b1764b4..ddb6d09e12a 100644 --- a/reference/test/solver/batch_bicgstab_kernels.cpp +++ b/reference/test/solver/batch_bicgstab_kernels.cpp @@ -2,26 +2,22 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/solver/batch_bicgstab_kernels.hpp" #include #include - #include - #include #include #include #include #include - +#include #include "core/base/batch_utilities.hpp" #include "core/matrix/batch_dense_kernels.hpp" -#include "core/solver/batch_bicgstab_kernels.hpp" #include "core/test/utils.hpp" #include "core/test/utils/batch_helpers.hpp" diff --git a/reference/test/solver/batch_cg_kernels.cpp b/reference/test/solver/batch_cg_kernels.cpp index a3c5dde92bc..4ccabfb8849 100644 --- a/reference/test/solver/batch_cg_kernels.cpp +++ b/reference/test/solver/batch_cg_kernels.cpp @@ -2,26 +2,22 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/solver/batch_cg_kernels.hpp" #include #include - #include - #include #include #include #include #include - +#include #include "core/base/batch_utilities.hpp" #include "core/matrix/batch_dense_kernels.hpp" -#include "core/solver/batch_cg_kernels.hpp" #include "core/test/utils.hpp" #include "core/test/utils/batch_helpers.hpp" diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp index c615015ce63..837920ec520 100644 --- a/reference/test/solver/bicg_kernels.cpp +++ b/reference/test/solver/bicg_kernels.cpp @@ -2,22 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/solver/bicg_kernels.hpp" #include - #include #include #include +#include #include #include #include #include - -#include "core/solver/bicg_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp index 32966138310..f09e78137b3 100644 --- a/reference/test/solver/bicgstab_kernels.cpp +++ 
b/reference/test/solver/bicgstab_kernels.cpp @@ -2,22 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/solver/bicgstab_kernels.hpp" #include - #include #include #include +#include #include #include #include #include - -#include "core/solver/bicgstab_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/solver/cb_gmres_kernels.cpp b/reference/test/solver/cb_gmres_kernels.cpp index 95c31bbd500..eeeca82494c 100644 --- a/reference/test/solver/cb_gmres_kernels.cpp +++ b/reference/test/solver/cb_gmres_kernels.cpp @@ -2,25 +2,20 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include - #include #include #include #include +#include #include #include #include #include - #include "core/test/utils.hpp" diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp index 6892d322906..7cbc629717c 100644 --- a/reference/test/solver/cg_kernels.cpp +++ b/reference/test/solver/cg_kernels.cpp @@ -2,22 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/solver/cg_kernels.hpp" #include - #include #include #include +#include #include #include #include #include - -#include "core/solver/cg_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp index ee6bad8ab8f..9024623ade8 100644 --- a/reference/test/solver/cgs_kernels.cpp +++ b/reference/test/solver/cgs_kernels.cpp @@ -2,22 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/solver/cgs_kernels.hpp" #include - #include #include #include +#include #include #include #include #include - -#include "core/solver/cgs_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/solver/direct.cpp b/reference/test/solver/direct.cpp index 23b7a069b90..1fb147a7a2b 100644 --- a/reference/test/solver/direct.cpp +++ b/reference/test/solver/direct.cpp @@ -2,26 +2,21 @@ // // SPDX-License-Identifier: 
BSD-3-Clause -#include - - #include - #include - #include #include #include #include #include +#include #include #include #include #include - #include "core/test/utils.hpp" #include "core/test/utils/matrix_generator.hpp" #include "matrices/config.hpp" diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp index de20c8e47d9..2b7b97ffc3b 100644 --- a/reference/test/solver/fcg_kernels.cpp +++ b/reference/test/solver/fcg_kernels.cpp @@ -2,22 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/solver/fcg_kernels.hpp" #include - #include #include #include +#include #include #include #include #include - -#include "core/solver/fcg_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp index 498447e6763..a81c3ce4285 100644 --- a/reference/test/solver/gcr_kernels.cpp +++ b/reference/test/solver/gcr_kernels.cpp @@ -2,28 +2,24 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/solver/gcr_kernels.hpp" #include #include - #include - #include #include #include #include #include +#include #include #include #include #include - -#include "core/solver/gcr_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 7968bd2ac4f..00f7766179f 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -2,29 +2,25 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/solver/gmres_kernels.hpp" #include #include - #include - #include #include #include #include #include +#include #include #include #include #include - #include "core/solver/common_gmres_kernels.hpp" -#include "core/solver/gmres_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/solver/idr_kernels.cpp b/reference/test/solver/idr_kernels.cpp index 056ddf0dfe5..c3ca4fc1bd9 100644 --- 
a/reference/test/solver/idr_kernels.cpp +++ b/reference/test/solver/idr_kernels.cpp @@ -2,21 +2,17 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include #include +#include #include #include #include #include - #include "core/test/utils.hpp" diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp index f41c1803afa..b0c1029f693 100644 --- a/reference/test/solver/ir_kernels.cpp +++ b/reference/test/solver/ir_kernels.cpp @@ -2,22 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/solver/ir_kernels.hpp" #include - #include #include #include #include +#include #include #include #include - -#include "core/solver/ir_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/solver/lower_trs.cpp b/reference/test/solver/lower_trs.cpp index b716235e587..d52ee028b53 100644 --- a/reference/test/solver/lower_trs.cpp +++ b/reference/test/solver/lower_trs.cpp @@ -4,16 +4,13 @@ #include - #include - #include #include #include #include - #include "core/test/utils.hpp" diff --git a/reference/test/solver/lower_trs_kernels.cpp b/reference/test/solver/lower_trs_kernels.cpp index 351550f2d44..da2e55700f5 100644 --- a/reference/test/solver/lower_trs_kernels.cpp +++ b/reference/test/solver/lower_trs_kernels.cpp @@ -2,12 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "core/solver/lower_trs_kernels.hpp" +#include #include - #include #include #include @@ -18,8 +18,6 @@ #include #include - -#include "core/solver/lower_trs_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp index b7fa09058fb..57ba8fba84d 100644 --- a/reference/test/solver/multigrid_kernels.cpp +++ b/reference/test/solver/multigrid_kernels.cpp @@ -2,12 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include #include @@ -16,11 +12,11 @@ #include #include #include 
+#include #include #include #include - #include "core/test/utils.hpp" diff --git a/reference/test/solver/upper_trs.cpp b/reference/test/solver/upper_trs.cpp index f9d30a5a595..9980c51f9d1 100644 --- a/reference/test/solver/upper_trs.cpp +++ b/reference/test/solver/upper_trs.cpp @@ -4,16 +4,13 @@ #include - #include - #include #include #include #include - #include "core/test/utils.hpp" diff --git a/reference/test/solver/upper_trs_kernels.cpp b/reference/test/solver/upper_trs_kernels.cpp index 1c259b65e14..dc964e6b83d 100644 --- a/reference/test/solver/upper_trs_kernels.cpp +++ b/reference/test/solver/upper_trs_kernels.cpp @@ -2,12 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "core/solver/upper_trs_kernels.hpp" +#include #include - #include #include #include @@ -18,8 +18,6 @@ #include #include - -#include "core/solver/upper_trs_kernels.hpp" #include "core/test/utils.hpp" diff --git a/reference/test/stop/combined.cpp b/reference/test/stop/combined.cpp index 29183c29924..900e8131aba 100644 --- a/reference/test/stop/combined.cpp +++ b/reference/test/stop/combined.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include + +#include #if defined(_WIN32) || defined(__CYGWIN__) #include #endif // defined(_WIN32) || defined(__CYGWIN__) @@ -14,7 +13,6 @@ #include - #include #include diff --git a/reference/test/stop/criterion_kernels.cpp b/reference/test/stop/criterion_kernels.cpp index 784544afe92..39ea9c72098 100644 --- a/reference/test/stop/criterion_kernels.cpp +++ b/reference/test/stop/criterion_kernels.cpp @@ -2,12 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - +#include #include diff --git a/reference/test/stop/iteration.cpp b/reference/test/stop/iteration.cpp index ec869f82812..fbe53888c61 100644 --- a/reference/test/stop/iteration.cpp +++ b/reference/test/stop/iteration.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include +#include + 
namespace { diff --git a/reference/test/stop/residual_norm_kernels.cpp b/reference/test/stop/residual_norm_kernels.cpp index 2996fe153a4..43b865796b7 100644 --- a/reference/test/stop/residual_norm_kernels.cpp +++ b/reference/test/stop/residual_norm_kernels.cpp @@ -2,18 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include #include - #include - +#include #include "core/test/utils.hpp" diff --git a/reference/test/stop/time.cpp b/reference/test/stop/time.cpp index 42eeb6e6a63..a5ea6107fbf 100644 --- a/reference/test/stop/time.cpp +++ b/reference/test/stop/time.cpp @@ -2,11 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include + +#include #if defined(_WIN32) || defined(__CYGWIN__) #include #endif // defined(_WIN32) || defined(__CYGWIN__) diff --git a/reference/test/utils/assertions_test.cpp b/reference/test/utils/assertions_test.cpp index 95286f7571a..98f1ec68e0d 100644 --- a/reference/test/utils/assertions_test.cpp +++ b/reference/test/utils/assertions_test.cpp @@ -4,14 +4,11 @@ #include "core/test/utils/assertions.hpp" - #include - #include #include - #include "core/test/utils.hpp" diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index 07749d9bed2..d15e6d2165f 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -2,20 +2,16 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/base/batch_multi_vector_kernels.hpp" #include #include - #include - +#include #include - -#include "core/base/batch_multi_vector_kernels.hpp" #include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp index 26dbcb73cf4..59c9ec209c3 100644 --- a/test/base/device_matrix_data_kernels.cpp +++ b/test/base/device_matrix_data_kernels.cpp @@ -2,22 +2,18 @@ // // 
SPDX-License-Identifier: BSD-3-Clause -#include - +#include "core/base/device_matrix_data_kernels.hpp" #include #include - #include - #include +#include #include #include - -#include "core/base/device_matrix_data_kernels.hpp" #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" #include "test/utils/executor.hpp" diff --git a/test/base/executor.cpp b/test/base/executor.cpp index 541360d01d4..3b93d7e748a 100644 --- a/test/base/executor.cpp +++ b/test/base/executor.cpp @@ -2,17 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - +#include "test/utils/executor.hpp" #include - #include +#include #include "core/test/utils/assertions.hpp" -#include "test/utils/executor.hpp" namespace reference { diff --git a/test/base/index_range.cpp b/test/base/index_range.cpp index b16b5fb9046..8bb5519c457 100644 --- a/test/base/index_range.cpp +++ b/test/base/index_range.cpp @@ -2,17 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "core/base/index_range.hpp" +#include #include - #include - #include "common/unified/base/kernel_launch.hpp" -#include "core/base/index_range.hpp" #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index c746a5b3461..8107e6a3eef 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -2,23 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "common/unified/base/kernel_launch.hpp" - - #include #include #include - #include - #include #include #include #include - +#include "common/unified/base/kernel_launch.hpp" #include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/base/array_access.hpp" diff --git a/test/base/timer.cpp b/test/base/timer.cpp index a817ddeef96..f2f0da113bf 100644 --- a/test/base/timer.cpp +++ b/test/base/timer.cpp @@ -2,15 +2,12 @@ // // SPDX-License-Identifier: 
BSD-3-Clause -#include - - #include #include - #include +#include #include "core/test/utils/assertions.hpp" #include "test/utils/executor.hpp" diff --git a/test/components/absolute_array_kernels.cpp b/test/components/absolute_array_kernels.cpp index 08dd52f35e3..a18ab1534c9 100644 --- a/test/components/absolute_array_kernels.cpp +++ b/test/components/absolute_array_kernels.cpp @@ -4,18 +4,14 @@ #include "core/components/absolute_array_kernels.hpp" - #include #include #include - #include - #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp index 3997c5830ea..122edb4dc27 100644 --- a/test/components/fill_array_kernels.cpp +++ b/test/components/fill_array_kernels.cpp @@ -4,18 +4,14 @@ #include "core/components/fill_array_kernels.hpp" - #include #include #include - #include - #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/components/format_conversion_kernels.cpp b/test/components/format_conversion_kernels.cpp index 053171ffbe2..3e783206af5 100644 --- a/test/components/format_conversion_kernels.cpp +++ b/test/components/format_conversion_kernels.cpp @@ -4,15 +4,12 @@ #include "core/components/format_conversion_kernels.hpp" - #include #include #include - #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/components/precision_conversion_kernels.cpp b/test/components/precision_conversion_kernels.cpp index f75aa948286..dcd6a0dba83 100644 --- a/test/components/precision_conversion_kernels.cpp +++ b/test/components/precision_conversion_kernels.cpp @@ -8,13 +8,10 @@ #include #include - #include - #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/components/prefix_sum_kernels.cpp b/test/components/prefix_sum_kernels.cpp index 73cb0c7874e..1ec97b6eadc 100644 --- a/test/components/prefix_sum_kernels.cpp +++ 
b/test/components/prefix_sum_kernels.cpp @@ -4,20 +4,16 @@ #include "core/components/prefix_sum_kernels.hpp" - #include #include #include #include #include - #include - #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp index dfc2e046c84..35c358099ad 100644 --- a/test/components/reduce_array_kernels.cpp +++ b/test/components/reduce_array_kernels.cpp @@ -4,18 +4,14 @@ #include "core/components/reduce_array_kernels.hpp" - #include #include #include - #include - #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/distributed/index_map_kernels.cpp b/test/distributed/index_map_kernels.cpp index cafd7b4da35..718fe84ce92 100644 --- a/test/distributed/index_map_kernels.cpp +++ b/test/distributed/index_map_kernels.cpp @@ -4,22 +4,18 @@ #include "core/distributed/index_map_kernels.hpp" - #include #include - #include #include - #include #include #include #include #include - #include "core/distributed/partition_kernels.hpp" #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp index 8445aee6a0e..3dcede95bfb 100644 --- a/test/distributed/matrix_kernels.cpp +++ b/test/distributed/matrix_kernels.cpp @@ -4,18 +4,14 @@ #include "core/distributed/matrix_kernels.hpp" - #include - #include #include - #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index 9e985ffec9e..5b014625e7d 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -5,10 +5,8 @@ #include #include - #include - #include "core/base/iterator_factory.hpp" #include "core/distributed/partition_helpers_kernels.hpp" #include "core/test/utils.hpp" diff --git 
a/test/distributed/partition_kernels.cpp b/test/distributed/partition_kernels.cpp index e857e734154..b00d266170c 100644 --- a/test/distributed/partition_kernels.cpp +++ b/test/distributed/partition_kernels.cpp @@ -4,20 +4,16 @@ #include "core/distributed/partition_kernels.hpp" - #include #include #include - #include #include - #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/distributed/vector_kernels.cpp b/test/distributed/vector_kernels.cpp index 86faca6b2b2..294b72d861e 100644 --- a/test/distributed/vector_kernels.cpp +++ b/test/distributed/vector_kernels.cpp @@ -4,20 +4,16 @@ #include "core/distributed/vector_kernels.hpp" - #include #include #include - #include #include - #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index c1d0a6c7336..b7c290eec17 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -4,20 +4,16 @@ #include "core/factorization/cholesky_kernels.hpp" - #include #include - #include - #include #include #include #include - #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" diff --git a/test/factorization/ic_kernels.cpp b/test/factorization/ic_kernels.cpp index c7cdbe5d435..ddb38575e03 100644 --- a/test/factorization/ic_kernels.cpp +++ b/test/factorization/ic_kernels.cpp @@ -7,15 +7,12 @@ #include #include - #include - #include #include #include - #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "matrices/config.hpp" diff --git a/test/factorization/ilu_kernels.cpp b/test/factorization/ilu_kernels.cpp index 8a5ced59041..bc7edeac57f 100644 --- a/test/factorization/ilu_kernels.cpp +++ b/test/factorization/ilu_kernels.cpp @@ -7,15 +7,12 @@ #include #include - #include - #include 
#include #include - #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "matrices/config.hpp" diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp index 0ea06bed506..035e938c7c8 100644 --- a/test/factorization/lu_kernels.cpp +++ b/test/factorization/lu_kernels.cpp @@ -4,15 +4,12 @@ #include "core/factorization/lu_kernels.hpp" - #include #include #include - #include - #include #include #include @@ -20,7 +17,6 @@ #include #include - #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp index 40a40b5acf5..64541612343 100644 --- a/test/factorization/par_ic_kernels.cpp +++ b/test/factorization/par_ic_kernels.cpp @@ -4,24 +4,20 @@ #include "core/factorization/par_ic_kernels.hpp" - #include #include #include #include #include - #include - #include #include #include #include #include - #include "core/factorization/factorization_kernels.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp index 81d1dd83ffb..b157971ff90 100644 --- a/test/factorization/par_ict_kernels.cpp +++ b/test/factorization/par_ict_kernels.cpp @@ -4,24 +4,20 @@ #include "core/factorization/par_ict_kernels.hpp" - #include #include #include #include #include - #include - #include #include #include #include #include - #include "core/factorization/factorization_kernels.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp index 0d853af0745..a2f3f774ba7 100644 --- a/test/factorization/par_ilu_kernels.cpp +++ b/test/factorization/par_ilu_kernels.cpp @@ -4,24 +4,20 @@ #include "core/factorization/par_ilu_kernels.hpp" - 
#include #include #include #include #include - #include - #include #include #include #include #include - #include "core/base/iterator_factory.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/test/utils.hpp" diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp index 7d46f7979ac..6426e725fdf 100644 --- a/test/factorization/par_ilut_kernels.cpp +++ b/test/factorization/par_ilut_kernels.cpp @@ -4,24 +4,20 @@ #include "core/factorization/par_ilut_kernels.hpp" - #include #include #include #include #include - #include - #include #include #include #include #include - #include "core/factorization/factorization_kernels.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" diff --git a/test/log/profiler_hook.cpp b/test/log/profiler_hook.cpp index 656134ce981..6e0ed2933db 100644 --- a/test/log/profiler_hook.cpp +++ b/test/log/profiler_hook.cpp @@ -4,13 +4,10 @@ #include - #include - #include - #include "test/utils/executor.hpp" diff --git a/test/matrix/batch_csr_kernels.cpp b/test/matrix/batch_csr_kernels.cpp index 28f3ba65b98..d2a1b2d9aa4 100644 --- a/test/matrix/batch_csr_kernels.cpp +++ b/test/matrix/batch_csr_kernels.cpp @@ -4,20 +4,16 @@ #include "core/matrix/batch_csr_kernels.hpp" - #include #include - #include - #include #include #include #include - #include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" #include "core/test/utils/array_generator.hpp" diff --git a/test/matrix/batch_dense_kernels.cpp b/test/matrix/batch_dense_kernels.cpp index 4a6665b80c1..222ccf6e4b9 100644 --- a/test/matrix/batch_dense_kernels.cpp +++ b/test/matrix/batch_dense_kernels.cpp @@ -4,20 +4,16 @@ #include "core/matrix/batch_dense_kernels.hpp" - #include #include - #include - #include #include #include #include - #include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" #include "core/test/utils/array_generator.hpp" diff --git a/test/matrix/batch_ell_kernels.cpp 
b/test/matrix/batch_ell_kernels.cpp index 72406cac549..7edef2c4fb0 100644 --- a/test/matrix/batch_ell_kernels.cpp +++ b/test/matrix/batch_ell_kernels.cpp @@ -4,20 +4,16 @@ #include "core/matrix/batch_ell_kernels.hpp" - #include #include - #include - #include #include #include #include - #include "core/base/batch_utilities.hpp" #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" diff --git a/test/matrix/coo_kernels.cpp b/test/matrix/coo_kernels.cpp index 26bcdb8791b..3da488cf843 100644 --- a/test/matrix/coo_kernels.cpp +++ b/test/matrix/coo_kernels.cpp @@ -4,13 +4,10 @@ #include "core/matrix/coo_kernels.hpp" - #include - #include - #include #include #include @@ -19,7 +16,6 @@ #include #include - #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "test/utils/executor.hpp" diff --git a/test/matrix/csr_kernels.cpp b/test/matrix/csr_kernels.cpp index d3a7bb8f8e5..1a1f100e1fd 100644 --- a/test/matrix/csr_kernels.cpp +++ b/test/matrix/csr_kernels.cpp @@ -4,20 +4,16 @@ #include "core/matrix/csr_kernels.hpp" - #include #include #include #include - #include - #include #include - #include "common/unified/base/kernel_launch.hpp" #include "core/base/array_access.hpp" #include "core/components/fill_array_kernels.hpp" diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp index 4ff8e749766..9272e99546e 100644 --- a/test/matrix/csr_kernels2.cpp +++ b/test/matrix/csr_kernels2.cpp @@ -2,19 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include - #include - #include #include #include +#include #include #include #include @@ -25,7 +21,6 @@ #include #include - #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/test/utils.hpp" diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp index 56ca536187e..b8fd4d7900c 100644 --- a/test/matrix/dense_kernels.cpp +++ b/test/matrix/dense_kernels.cpp @@ -4,16 +4,13 @@ #include 
"core/matrix/dense_kernels.hpp" - #include #include #include #include - #include - #include #include #include @@ -27,7 +24,6 @@ #include #include - #include "core/components/fill_array_kernels.hpp" #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/matrix/diagonal_kernels.cpp b/test/matrix/diagonal_kernels.cpp index ffe1f4267e1..ca0a9eff205 100644 --- a/test/matrix/diagonal_kernels.cpp +++ b/test/matrix/diagonal_kernels.cpp @@ -4,20 +4,16 @@ #include "core/matrix/diagonal_kernels.hpp" - #include #include #include #include - #include - #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/matrix/ell_kernels.cpp b/test/matrix/ell_kernels.cpp index b61d97a0a7a..78af81ccafc 100644 --- a/test/matrix/ell_kernels.cpp +++ b/test/matrix/ell_kernels.cpp @@ -4,13 +4,10 @@ #include "core/matrix/ell_kernels.hpp" - #include - #include - #include #include #include @@ -19,7 +16,6 @@ #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp index 737bb926bfe..a3b85143bf0 100644 --- a/test/matrix/fbcsr_kernels.cpp +++ b/test/matrix/fbcsr_kernels.cpp @@ -4,17 +4,13 @@ #include "core/matrix/fbcsr_kernels.hpp" - #include - #include - #include #include - #include "core/test/matrix/fbcsr_sample.hpp" #include "core/test/utils.hpp" #include "core/test/utils/fb_matrix_generator.hpp" diff --git a/test/matrix/fft_kernels.cpp b/test/matrix/fft_kernels.cpp index ed186b1df60..056087fb9f3 100644 --- a/test/matrix/fft_kernels.cpp +++ b/test/matrix/fft_kernels.cpp @@ -4,17 +4,14 @@ #include - #include - #include #include #include #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/matrix/hybrid_kernels.cpp b/test/matrix/hybrid_kernels.cpp index 8fc3346d667..64179259deb 100644 --- a/test/matrix/hybrid_kernels.cpp +++ b/test/matrix/hybrid_kernels.cpp @@ -4,20 +4,16 
@@ #include "core/matrix/hybrid_kernels.hpp" - #include - #include - #include #include #include #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index 3a18b6700ab..7398b3edb06 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -7,10 +7,8 @@ #include #include - #include - #include #include #include @@ -23,7 +21,6 @@ #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/matrix/permutation_kernels.cpp b/test/matrix/permutation_kernels.cpp index 7c3aac97f55..e6324c15f1d 100644 --- a/test/matrix/permutation_kernels.cpp +++ b/test/matrix/permutation_kernels.cpp @@ -5,14 +5,11 @@ #include #include - #include - #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/matrix/scaled_permutation_kernels.cpp b/test/matrix/scaled_permutation_kernels.cpp index d81a40b6f63..7239862a8d9 100644 --- a/test/matrix/scaled_permutation_kernels.cpp +++ b/test/matrix/scaled_permutation_kernels.cpp @@ -5,13 +5,10 @@ #include #include - #include - #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/matrix/sellp_kernels.cpp b/test/matrix/sellp_kernels.cpp index ae6b9053e45..053369f7fa6 100644 --- a/test/matrix/sellp_kernels.cpp +++ b/test/matrix/sellp_kernels.cpp @@ -4,13 +4,10 @@ #include "core/matrix/sellp_kernels.hpp" - #include - #include - #include #include #include @@ -18,7 +15,6 @@ #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/matrix/sparsity_csr_kernels.cpp b/test/matrix/sparsity_csr_kernels.cpp index 010bd7faa86..8d3728f240d 100644 --- a/test/matrix/sparsity_csr_kernels.cpp +++ b/test/matrix/sparsity_csr_kernels.cpp @@ -4,21 +4,17 @@ #include "core/matrix/sparsity_csr_kernels.hpp" - #include #include - #include - #include #include #include #include #include - #include 
"core/test/utils.hpp" #include "core/test/utils/assertions.hpp" #include "core/test/utils/matrix_generator.hpp" diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index d836eb008d9..8a201c78733 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -6,13 +6,10 @@ #include #include - #include - #include - #include #include #include @@ -22,7 +19,6 @@ #include #include - #include "core/test/utils.hpp" #include "test/utils/mpi/executor.hpp" diff --git a/test/mpi/multigrid/pgm.cpp b/test/mpi/multigrid/pgm.cpp index 8e72588128b..ccd7dd46b44 100644 --- a/test/mpi/multigrid/pgm.cpp +++ b/test/mpi/multigrid/pgm.cpp @@ -5,13 +5,10 @@ #include #include - #include - #include - #include #include #include @@ -21,7 +18,6 @@ #include #include - #include "core/test/utils.hpp" #include "test/utils/mpi/executor.hpp" diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp index c19e3277510..b89295acc13 100644 --- a/test/mpi/partition_helpers.cpp +++ b/test/mpi/partition_helpers.cpp @@ -5,7 +5,6 @@ #include #include - #include "core/test/utils.hpp" #include "test/utils/mpi/executor.hpp" diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index 5c17254a970..cf29ea77016 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ b/test/mpi/preconditioner/schwarz.cpp @@ -6,13 +6,10 @@ #include #include - #include - #include - #include #include #include @@ -29,7 +26,6 @@ #include #include - #include "core/test/utils.hpp" #include "core/test/utils/matrix_generator.hpp" #include "core/utils/matrix_utils.hpp" diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp index e4e7077e4c7..4548dc5d6b7 100644 --- a/test/mpi/solver/solver.cpp +++ b/test/mpi/solver/solver.cpp @@ -8,10 +8,8 @@ #include #include - #include - #include #include #include @@ -30,7 +28,6 @@ #include #include - #include "core/test/utils.hpp" #include "core/test/utils/matrix_generator.hpp" #include "core/utils/matrix_utils.hpp" diff --git 
a/test/mpi/vector.cpp b/test/mpi/vector.cpp index 11fe41aded2..3af6886dd84 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -5,13 +5,10 @@ #include #include - #include - #include - #include #include #include @@ -19,7 +16,6 @@ #include #include - #include "core/test/utils.hpp" #include "test/utils/mpi/executor.hpp" diff --git a/test/multigrid/fixed_coarsening_kernels.cpp b/test/multigrid/fixed_coarsening_kernels.cpp index 91699b8631e..0f3c7e56b2a 100644 --- a/test/multigrid/fixed_coarsening_kernels.cpp +++ b/test/multigrid/fixed_coarsening_kernels.cpp @@ -8,10 +8,8 @@ #include #include - #include - #include #include #include @@ -23,7 +21,6 @@ #include #include - #include "core/components/fill_array_kernels.hpp" #include "core/test/utils.hpp" #include "core/test/utils/matrix_generator.hpp" diff --git a/test/multigrid/pgm_kernels.cpp b/test/multigrid/pgm_kernels.cpp index 10e5cf01a7a..b0e3b338cbd 100644 --- a/test/multigrid/pgm_kernels.cpp +++ b/test/multigrid/pgm_kernels.cpp @@ -4,15 +4,12 @@ #include "core/multigrid/pgm_kernels.hpp" - #include #include #include - #include - #include #include #include @@ -24,7 +21,6 @@ #include #include - #include "core/test/utils.hpp" #include "core/test/utils/matrix_generator.hpp" #include "core/test/utils/unsort_matrix.hpp" diff --git a/test/preconditioner/batch_jacobi_kernels.cpp b/test/preconditioner/batch_jacobi_kernels.cpp index f8a1bd015ef..9bdbb015949 100644 --- a/test/preconditioner/batch_jacobi_kernels.cpp +++ b/test/preconditioner/batch_jacobi_kernels.cpp @@ -4,14 +4,11 @@ #include "core/preconditioner/batch_jacobi_kernels.hpp" - #include #include - #include - #include #include #include @@ -21,7 +18,6 @@ #include #include - #include "core/solver/batch_bicgstab_kernels.hpp" #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" diff --git a/test/preconditioner/isai_kernels.cpp b/test/preconditioner/isai_kernels.cpp index 6e737d31790..077379ab226 100644 --- 
a/test/preconditioner/isai_kernels.cpp +++ b/test/preconditioner/isai_kernels.cpp @@ -4,13 +4,10 @@ #include "core/preconditioner/isai_kernels.hpp" - #include - #include - #include #include #include @@ -19,7 +16,6 @@ #include #include - #include "core/test/utils.hpp" #include "matrices/config.hpp" #include "test/utils/executor.hpp" diff --git a/test/preconditioner/jacobi_kernels.cpp b/test/preconditioner/jacobi_kernels.cpp index d7586a9890e..5ae7c56e715 100644 --- a/test/preconditioner/jacobi_kernels.cpp +++ b/test/preconditioner/jacobi_kernels.cpp @@ -4,15 +4,12 @@ #include - #include - #include #include #include - #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "core/utils/matrix_utils.hpp" diff --git a/test/reorder/amd.cpp b/test/reorder/amd.cpp index 27639d11aad..8137ed8ad7e 100644 --- a/test/reorder/amd.cpp +++ b/test/reorder/amd.cpp @@ -7,14 +7,11 @@ #include #include - #include - #include #include - #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "matrices/config.hpp" diff --git a/test/reorder/mc64.cpp b/test/reorder/mc64.cpp index d4e4b176da7..0cc3ea33a3d 100644 --- a/test/reorder/mc64.cpp +++ b/test/reorder/mc64.cpp @@ -4,11 +4,9 @@ #include - #include #include - #include "core/test/utils/assertions.hpp" #include "test/utils/executor.hpp" diff --git a/test/reorder/nested_dissection.cpp b/test/reorder/nested_dissection.cpp index 93517b73f6d..2d11bdccb12 100644 --- a/test/reorder/nested_dissection.cpp +++ b/test/reorder/nested_dissection.cpp @@ -4,14 +4,11 @@ #include - #include - #include #include - #include "core/test/utils.hpp" #include "matrices/config.hpp" #include "test/utils/executor.hpp" diff --git a/test/reorder/rcm.cpp b/test/reorder/rcm.cpp index 923a5c1f10f..848d0deea5d 100644 --- a/test/reorder/rcm.cpp +++ b/test/reorder/rcm.cpp @@ -8,16 +8,13 @@ #include #include - #include - #include #include #include #include - #include "core/components/disjoint_sets.hpp" #include 
"core/test/utils.hpp" #include "core/test/utils/assertions.hpp" diff --git a/test/solver/batch_bicgstab_kernels.cpp b/test/solver/batch_bicgstab_kernels.cpp index 14bca65e41f..8f4bfca00cc 100644 --- a/test/solver/batch_bicgstab_kernels.cpp +++ b/test/solver/batch_bicgstab_kernels.cpp @@ -4,14 +4,11 @@ #include "core/solver/batch_bicgstab_kernels.hpp" - #include #include - #include - #include #include #include @@ -19,7 +16,6 @@ #include #include - #include "core/base/batch_utilities.hpp" #include "core/matrix/batch_dense_kernels.hpp" #include "core/test/utils.hpp" diff --git a/test/solver/batch_cg_kernels.cpp b/test/solver/batch_cg_kernels.cpp index 7c013020686..7b5a85a1e5b 100644 --- a/test/solver/batch_cg_kernels.cpp +++ b/test/solver/batch_cg_kernels.cpp @@ -4,21 +4,17 @@ #include "core/solver/batch_cg_kernels.hpp" - #include #include - #include - #include #include #include #include #include - #include "core/base/batch_utilities.hpp" #include "core/matrix/batch_dense_kernels.hpp" #include "core/test/utils.hpp" diff --git a/test/solver/bicg_kernels.cpp b/test/solver/bicg_kernels.cpp index ab63b01f9cc..5f9dd818711 100644 --- a/test/solver/bicg_kernels.cpp +++ b/test/solver/bicg_kernels.cpp @@ -4,13 +4,10 @@ #include "core/solver/bicg_kernels.hpp" - #include - #include - #include #include #include @@ -19,7 +16,6 @@ #include #include - #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" #include "matrices/config.hpp" diff --git a/test/solver/bicgstab_kernels.cpp b/test/solver/bicgstab_kernels.cpp index 4f68edd6a8e..9548c99daf9 100644 --- a/test/solver/bicgstab_kernels.cpp +++ b/test/solver/bicgstab_kernels.cpp @@ -4,13 +4,10 @@ #include "core/solver/bicgstab_kernels.hpp" - #include - #include - #include #include #include @@ -20,7 +17,6 @@ #include #include - #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/solver/cb_gmres_kernels.cpp b/test/solver/cb_gmres_kernels.cpp index 
3b5f5956c2e..45a752a2292 100644 --- a/test/solver/cb_gmres_kernels.cpp +++ b/test/solver/cb_gmres_kernels.cpp @@ -4,15 +4,12 @@ #include "core/solver/cb_gmres_kernels.hpp" - #include #include #include - #include - #include #include #include @@ -21,7 +18,6 @@ #include #include - #include "core/solver/cb_gmres_accessor.hpp" #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/solver/cg_kernels.cpp b/test/solver/cg_kernels.cpp index be9dc052314..b4408851da6 100644 --- a/test/solver/cg_kernels.cpp +++ b/test/solver/cg_kernels.cpp @@ -4,13 +4,10 @@ #include "core/solver/cg_kernels.hpp" - #include - #include - #include #include #include @@ -19,7 +16,6 @@ #include #include - #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/solver/cgs_kernels.cpp b/test/solver/cgs_kernels.cpp index 6c2bab293e3..392167d2106 100644 --- a/test/solver/cgs_kernels.cpp +++ b/test/solver/cgs_kernels.cpp @@ -4,13 +4,10 @@ #include "core/solver/cgs_kernels.hpp" - #include - #include - #include #include #include @@ -19,7 +16,6 @@ #include #include - #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp index be68593628f..c2e6c757f76 100644 --- a/test/solver/direct.cpp +++ b/test/solver/direct.cpp @@ -5,17 +5,14 @@ #include #include - #include - #include #include #include #include #include - #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" diff --git a/test/solver/fcg_kernels.cpp b/test/solver/fcg_kernels.cpp index f1f09f759bc..9ad2be9eb05 100644 --- a/test/solver/fcg_kernels.cpp +++ b/test/solver/fcg_kernels.cpp @@ -4,13 +4,10 @@ #include "core/solver/fcg_kernels.hpp" - #include - #include - #include #include #include @@ -19,7 +16,6 @@ #include #include - #include "core/test/utils.hpp" 
#include "core/utils/matrix_utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp index 7a00b3fed30..d26b5ef265c 100644 --- a/test/solver/gcr_kernels.cpp +++ b/test/solver/gcr_kernels.cpp @@ -4,13 +4,10 @@ #include "core/solver/gcr_kernels.hpp" - #include - #include - #include #include #include @@ -20,7 +17,6 @@ #include #include - #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp index 08259c91ce0..52ee885e29d 100644 --- a/test/solver/gmres_kernels.cpp +++ b/test/solver/gmres_kernels.cpp @@ -4,13 +4,10 @@ #include "core/solver/gmres_kernels.hpp" - #include - #include - #include #include #include @@ -20,7 +17,6 @@ #include #include - #include "core/solver/common_gmres_kernels.hpp" #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/solver/idr_kernels.cpp b/test/solver/idr_kernels.cpp index b165824dbe0..7afac1c2f33 100644 --- a/test/solver/idr_kernels.cpp +++ b/test/solver/idr_kernels.cpp @@ -4,11 +4,9 @@ #include "core/solver/idr_kernels.hpp" - #include #include - #include @@ -28,7 +26,6 @@ #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/solver/ir_kernels.cpp b/test/solver/ir_kernels.cpp index 7a8e84324bd..114dee3c06b 100644 --- a/test/solver/ir_kernels.cpp +++ b/test/solver/ir_kernels.cpp @@ -4,13 +4,10 @@ #include "core/solver/ir_kernels.hpp" - #include - #include - #include #include #include @@ -19,7 +16,6 @@ #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/solver/lower_trs_kernels.cpp b/test/solver/lower_trs_kernels.cpp index 1f99499a129..4bccf283faf 100644 --- a/test/solver/lower_trs_kernels.cpp +++ b/test/solver/lower_trs_kernels.cpp @@ -5,17 +5,14 @@ #include #include - #include - #include #include #include #include #include - 
#include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/solver/multigrid_kernels.cpp b/test/solver/multigrid_kernels.cpp index 4b4b0157df5..894f4280346 100644 --- a/test/solver/multigrid_kernels.cpp +++ b/test/solver/multigrid_kernels.cpp @@ -4,13 +4,10 @@ #include "core/solver/multigrid_kernels.hpp" - #include - #include - #include #include #include @@ -18,7 +15,6 @@ #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp index fab351227f9..5b24234ce14 100644 --- a/test/solver/solver.cpp +++ b/test/solver/solver.cpp @@ -8,10 +8,8 @@ #include #include - #include - #include #include #include @@ -31,7 +29,6 @@ #include #include - #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/solver/upper_trs_kernels.cpp b/test/solver/upper_trs_kernels.cpp index 33d2196e097..c7041865dd1 100644 --- a/test/solver/upper_trs_kernels.cpp +++ b/test/solver/upper_trs_kernels.cpp @@ -5,17 +5,14 @@ #include #include - #include - #include #include #include #include #include - #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/stop/combined_kernels.cpp b/test/stop/combined_kernels.cpp index 8d9b0986c91..7e18a0c32aa 100644 --- a/test/stop/combined_kernels.cpp +++ b/test/stop/combined_kernels.cpp @@ -4,11 +4,9 @@ #include - #include #include - #include "test/utils/executor.hpp" diff --git a/test/stop/criterion_kernels.cpp b/test/stop/criterion_kernels.cpp index 91795d59bed..6b6094125ba 100644 --- a/test/stop/criterion_kernels.cpp +++ b/test/stop/criterion_kernels.cpp @@ -4,11 +4,9 @@ #include - #include #include - #include "test/utils/executor.hpp" diff --git a/test/stop/residual_norm_kernels.cpp b/test/stop/residual_norm_kernels.cpp index ed3b775a61c..7c3ddf6624e 100644 --- 
a/test/stop/residual_norm_kernels.cpp +++ b/test/stop/residual_norm_kernels.cpp @@ -4,11 +4,9 @@ #include - #include #include - #include "core/test/utils.hpp" #include "test/utils/executor.hpp" diff --git a/test/test_install/test_install.cpp b/test/test_install/test_install.cpp index 48252ef9bbe..2f4cdeda6e4 100644 --- a/test/test_install/test_install.cpp +++ b/test/test_install/test_install.cpp @@ -2,9 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include @@ -17,6 +14,8 @@ #include #include +#include + void assert_similar_matrices(gko::ptr_param> m1, gko::ptr_param> m2, diff --git a/test/tools/resource_file_generator.cpp b/test/tools/resource_file_generator.cpp index c1d4996267c..f6b35229a19 100644 --- a/test/tools/resource_file_generator.cpp +++ b/test/tools/resource_file_generator.cpp @@ -5,7 +5,6 @@ #include #include - #include diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index 21c40a70c0a..b31d1242f35 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -6,19 +6,14 @@ #define GKO_TEST_UTILS_EXECUTOR_HPP_ -#include - - #include #include - #include - +#include #include - #include "core/test/gtest/resources.hpp" diff --git a/test/utils/mpi/executor.hpp b/test/utils/mpi/executor.hpp index 180c31f37cf..199de02c054 100644 --- a/test/utils/mpi/executor.hpp +++ b/test/utils/mpi/executor.hpp @@ -6,18 +6,13 @@ #define GKO_TEST_UTILS_MPI_EXECUTOR_HPP_ -#include - - #include - #include - +#include #include - #include "test/utils/executor.hpp" From 099481fb2234dc29118bc9e542446863d3948466 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 16 Nov 2023 18:20:54 +0000 Subject: [PATCH 024/448] update contributing documentation --- CONTRIBUTING.md | 62 ++++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 04cf35aec8c..d460087b3c8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -176,12 +176,12 @@ 
improvements from code reviews. ### Automatic code formatting -Ginkgo uses [ClangFormat](https://clang.llvm.org/docs/ClangFormat.html) -(executable is usually named `clang-format`) and a custom `.clang-format` -configuration file (mostly based on ClangFormat's _Google_ style) to -automatically format your code. __Make sure you have ClangFormat set up and -running properly__ ( you should be able to run `make format` from Ginkgo's build -directory) before committing anything that will end up in a pull request against +Ginkgo uses [pre-commit](https://pre-commit.com/) to automatically apply +code formatting when committing changes to git. What formatting is applied +is managed through [ClangFormat](https://clang.llvm.org/docs/ClangFormat.html) +with a custom `.clang-format` configuration file (mostly based on ClangFormat's +_Google_ style). __Make sure you have pre-commit set up and running properly__ +before committing anything that will end up in a pull request against `ginkgo-project/ginkgo` repository. In addition, you should __never__ modify the `.clang-format` configuration file shipped with Ginkgo. E.g. if ClangFormat has trouble reading this file on your system, you should install a newer version of @@ -339,64 +339,53 @@ Thus, contributors should be aware of the following rules for blank lines: ### Include statement grouping +The concrete ordering will be done by `clang-format`. +Here are the rules that `clang-format` will follow. In general, all include statements should be present on the top of the file, -ordered in the following groups, with two blank lines between each group: +ordered in the following groups, with *one* blank line between each group: -1. Related header file (e.g. `core/foo/bar.hpp` included in `core/foo/bar.cpp`, +1. Main header file (e.g. `core/foo/bar.hpp` included in `core/foo/bar.cpp`, or in the unit test`core/test/foo/bar.cpp`) 2. Standard library headers (e.g. `vector`) 3. Executor specific library headers (e.g. `omp.h`) 4. 
System third-party library headers (e.g. `papi.h`) -5. Local third-party library headers -6. Public Ginkgo headers -7. Private Ginkgo headers +5. Public Ginkgo headers +6. Local headers _Example_: A file `core/base/my_file.cpp` might have an include list like this: ```c++ -#include - +#include "ginkgo/core/base/my_file.hpp" #include #include #include - #include - #include - -#include "third_party/blas/cblas.hpp" -#include "third_party/lapack/lapack.hpp" - +#include +#include #include #include #include - #include "core/base/my_file_kernels.hpp" ``` #### Main header -This section presents general rules used to define the main header attributed to -the file. In the previous example, this would be ` #include -`. +This section presents the handling of the main header attributed to a file. +For a given file, the main header is the header that contains the declarations of the +functions, classes, etc., which are implemented in this file. +In the previous example, this would be ` #include "ginkgo/core/base/my_file.hpp"`. +The `clang-format` tool figures out the main header. The only intervention from +a contributor is to *always* include the main header using `"..."`. -General rules: -1. Some fixed main header. -2. components: - - with `_kernel` suffix looks for the header in the same folder. - - without `_kernel` suffix looks for the header in `core`. -3. `test/utils`: looks for the header in `core` -4. `core`: looks for the header in `ginkgo` -5. `test` or `base`: looks for the header in `ginkgo/core` -6. others: looks for the header in `core` +Please note that this only applies to implementation files, so files ending in `.cpp` or `.cu`. -_Note_: Please see the detail in the `dev_tools/scripts/config`. #### Some general comments. @@ -405,13 +394,6 @@ _Note_: Please see the detail in the `dev_tools/scripts/config`. When compiling with `GINKGO_CHECK_CIRCULAR_DEPS` enabled, this property is explicitly checked. 3. 
The recommendations of the `iwyu` (Include what you use) tool can be used to make sure that the headers are self-sufficient and that the compiled files ( `.cu`, `.cpp`, `.hip.cpp` ) include only what they use. A [CI pipeline](https://gitlab.com/ginkgo-project/ginkgo-public-ci/-/jobs/584358356) is available that runs with the `iwyu` tool. Please be aware that this tool can be incorrect in some cases. -#### Automatic header arrangement - -1. `dev_tools/script/format_header.sh` will take care of the group/sorting of - headers according to this guideline. -2. `make format_header` arranges the header of the modified files in the branch. -3. `make format_header_all` arranges the header of all files. - ### Other Code Formatting not handled by ClangFormat From 95cff7495dd28fb967b1bd449924a16057077105 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 14 Jun 2024 14:48:37 +0200 Subject: [PATCH 025/448] [test] fix whitespace bug in profiler_hook test --- core/test/log/profiler_hook.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/core/test/log/profiler_hook.cpp b/core/test/log/profiler_hook.cpp index 40bd6394475..ce74e526879 100644 --- a/core/test/log/profiler_hook.cpp +++ b/core/test/log/profiler_hook.cpp @@ -387,8 +387,9 @@ TEST(ProfilerHookTableSummaryWriter, SummaryWorks) entries.push_back({"medium", 1ms, 500us, 4}); // check division by count entries.push_back({"long", 120s, 60s, 1}); entries.push_back({"eternal", 24h, 24h, 1}); + // clang-format off const auto expected = R"(Test header -Overhead estimate 1.0 s +Overhead estimate 1.0 ns | name | total | total (self) | count | avg | avg (self) | |----------|-------:|-------------:|------:|---------:|-----------:| | eternal | 1.0 d | 1.0 d | 1 | 1.0 d | 1.0 d | @@ -398,8 +399,9 @@ Overhead estimate 1.0 s | short | 1.0 ns | 0.0 ns | 1 | 1.0 ns | 0.0 ns | | empty | 0.0 ns | 0.0 ns | 0 | 0.0 ns | 0.0 ns | )"; + // clang-format on - writer.write(entries, 1s); + writer.write(entries, 1ns); 
ASSERT_EQ(ss.str(), expected); } @@ -422,6 +424,7 @@ TEST(ProfilerHookTableSummaryWriter, NestedSummaryWorks) 2, {ProfilerHook::nested_summary_entry{"child", 100ns, 2, {}}}}, ProfilerHook::nested_summary_entry{"baz", 1ns, 2, {}}}}; + // clang-format off const auto expected = R"(Test header Overhead estimate 1.0 ns | name | total | fraction | count | avg | @@ -434,6 +437,7 @@ Overhead estimate 1.0 ns | foo | 100.0 ns | 5.0 % | 5 | 20.0 ns | | baz | 1.0 ns | 0.1 % | 2 | 0.0 ns | )"; + // clang-format on writer.write_nested(entry, 1ns); From 648b10e995588fe485cbc1cf47656893a2d09fcb Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 28 Jun 2024 09:54:00 +0200 Subject: [PATCH 026/448] Revert "adds script to change main include to use "" instead of <>" This reverts commit a9880c2ae61571edc4846887cc299ee7a2da3850. --- dev_tools/scripts/change-main-include.py | 60 ------------------------ 1 file changed, 60 deletions(-) delete mode 100755 dev_tools/scripts/change-main-include.py diff --git a/dev_tools/scripts/change-main-include.py b/dev_tools/scripts/change-main-include.py deleted file mode 100755 index 7ee5e8cd922..00000000000 --- a/dev_tools/scripts/change-main-include.py +++ /dev/null @@ -1,60 +0,0 @@ -#! 
/usr/bin/env python3 -import collections -import sys -import re - -files = sys.argv[1:] - -test_subdirectories = [ - "base", "config", "distributed", "factorization", - "log", "matrix", "multigrid", "preconditioner", - "reorder", "solver", "stop", "synthesizer" -] - -false_positives = [ - "test/utils/executor.hpp", - "test/utils/mpi/executor.hpp" -] - - -for filename in files: - suffix = re.compile(r"(\.cpp|\.cu|\.inc)$") - main_include_re = re.compile(r"#include\s+]+)>") - - Match = collections.namedtuple("Match", ["idx", "line"]) - - if not suffix.search(filename): - continue - - if any(f"test/{subdir}" in filename for subdir in test_subdirectories): - continue - - if any(filename.endswith(fp) for fp in false_positives): - continue - - with open(filename, 'r') as file: - content = file.readlines() - - try: - first_include = next(Match(idx=i, line=l) for i, l in enumerate(content) if l.startswith("#include")) - except: - first_include = Match(idx=-1, line="") - if "', '"') - with open(filename, 'w') as file: - file.writelines(content) From 7afbbf4e377ca639e25bba21d4c783668c0b8a2f Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 28 Jun 2024 11:01:49 +0200 Subject: [PATCH 027/448] [cuda] rem deprecated shmem config guard --- cuda/base/kernel_config.hpp | 59 --------------------------- cuda/solver/batch_bicgstab_kernels.cu | 4 -- cuda/solver/batch_cg_kernels.cu | 4 -- 3 files changed, 67 deletions(-) delete mode 100644 cuda/base/kernel_config.hpp diff --git a/cuda/base/kernel_config.hpp b/cuda/base/kernel_config.hpp deleted file mode 100644 index f0821a42976..00000000000 --- a/cuda/base/kernel_config.hpp +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_BASE_KERNEL_CONFIG_HPP_ -#define GKO_CUDA_BASE_KERNEL_CONFIG_HPP_ - - -#include - -#include - - -namespace gko { -namespace kernels { -namespace cuda { -namespace detail { - - -template -class 
shared_memory_config_guard { -public: - using value_type = ValueType; - shared_memory_config_guard() : original_config_{} - { - GKO_ASSERT_NO_CUDA_ERRORS( - cudaDeviceGetSharedMemConfig(&original_config_)); - - if (sizeof(value_type) == 4) { - GKO_ASSERT_NO_CUDA_ERRORS( - cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); - } else if (sizeof(value_type) % 8 == 0) { - GKO_ASSERT_NO_CUDA_ERRORS( - cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); - } else { - GKO_ASSERT_NO_CUDA_ERRORS( - cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeDefault)); - } - } - - - ~shared_memory_config_guard() - { - // No need to exit or throw if we cant set the value back. - cudaDeviceSetSharedMemConfig(original_config_); - } - -private: - cudaSharedMemConfig original_config_; -}; - - -} // namespace detail -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_BASE_KERNEL_CONFIG_HPP_ diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index b6ae74a5064..28efaf07475 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -18,7 +18,6 @@ #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/kernel_config.hpp" #include "cuda/base/thrust.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" @@ -143,9 +142,6 @@ public: constexpr int align_multiple = 8; const int padded_num_rows = ceildiv(mat.num_rows, align_multiple) * align_multiple; - auto shem_guard = - gko::kernels::cuda::detail::shared_memory_config_guard< - value_type>(); const int shmem_per_blk = get_max_dynamic_shared_memory(exec_); diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 5425bd9cd9c..cff72652629 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -17,7 +17,6 @@ #include 
"core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/kernel_config.hpp" #include "cuda/base/thrust.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" @@ -141,9 +140,6 @@ public: constexpr int align_multiple = 8; const int padded_num_rows = ceildiv(mat.num_rows, align_multiple) * align_multiple; - auto shem_guard = - gko::kernels::cuda::detail::shared_memory_config_guard< - value_type>(); const int shmem_per_blk = get_max_dynamic_shared_memory(exec_); From 81ebe46b76f8d4f014651b71a480243e3f5d3eae Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 19 May 2024 01:43:14 +0200 Subject: [PATCH 028/448] add executor description --- core/device_hooks/cuda_hooks.cpp | 3 ++ core/device_hooks/dpcpp_hooks.cpp | 3 ++ core/device_hooks/hip_hooks.cpp | 3 ++ core/device_hooks/omp_hooks.cpp | 3 ++ core/test/gtest/environments.hpp | 56 ++++++--------------------- cuda/base/executor.cpp | 9 +++++ dpcpp/base/executor.dp.cpp | 11 ++++++ hip/base/executor.hip.cpp | 9 +++++ include/ginkgo/core/base/executor.hpp | 13 +++++++ omp/base/executor.cpp | 7 ++++ 10 files changed, 73 insertions(+), 44 deletions(-) diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp index abda9e4e0f6..4124ac2bea5 100644 --- a/core/device_hooks/cuda_hooks.cpp +++ b/core/device_hooks/cuda_hooks.cpp @@ -148,6 +148,9 @@ scoped_device_id_guard CudaExecutor::get_scoped_device_id_guard() const GKO_NOT_COMPILED(cuda); +std::string CudaExecutor::get_description() const GKO_NOT_COMPILED(cuda); + + std::string CudaError::get_error(int64) { return "ginkgo CUDA module is not compiled"; diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp index 6cd86581998..470fd9befc4 100644 --- a/core/device_hooks/dpcpp_hooks.cpp +++ b/core/device_hooks/dpcpp_hooks.cpp @@ -91,6 +91,9 @@ scoped_device_id_guard DpcppExecutor::get_scoped_device_id_guard() const 
GKO_NOT_COMPILED(dpcpp); +std::string DpcppExecutor::get_description() const GKO_NOT_COMPILED(dpcpp); + + int DpcppExecutor::get_num_devices(std::string) { return 0; } diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index 573fb37b8f0..7f3497e8020 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -147,6 +147,9 @@ scoped_device_id_guard HipExecutor::get_scoped_device_id_guard() const GKO_NOT_COMPILED(hip); +std::string HipExecutor::get_description() const GKO_NOT_COMPILED(hip); + + std::string HipError::get_error(int64) { return "ginkgo HIP module is not compiled"; diff --git a/core/device_hooks/omp_hooks.cpp b/core/device_hooks/omp_hooks.cpp index c371f8ff767..33025006a4d 100644 --- a/core/device_hooks/omp_hooks.cpp +++ b/core/device_hooks/omp_hooks.cpp @@ -24,6 +24,9 @@ scoped_device_id_guard::scoped_device_id_guard(const OmpExecutor* exec, GKO_NOT_COMPILED(omp); +std::string OmpExecutor::get_description() const GKO_NOT_COMPILED(omp); + + int OmpExecutor::get_num_omp_threads() { return 1; } diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp index 01250c41929..1268b92c4c1 100644 --- a/core/test/gtest/environments.hpp +++ b/core/test/gtest/environments.hpp @@ -17,6 +17,7 @@ #include #include "core/test/gtest/resources.hpp" +#include "test/utils/executor.hpp" #ifdef GKO_COMPILING_OMP @@ -43,60 +44,27 @@ class DeviceEnvironment : public ::testing::Environment { public: explicit DeviceEnvironment(int rank) : rank_(rank) { print_environment(); } -#ifdef GKO_COMPILING_OMP void print_environment() const { + auto ref = gko::ReferenceExecutor::create(); +#ifdef GKO_COMPILING_OMP if (ResourceEnvironment::omp_threads > 0) { omp_set_num_threads(ResourceEnvironment::omp_threads); } - std::stringstream ss; - ss << "Rank " << rank_ << ": OMP threads " << omp_get_max_threads() - << std::endl; - std::cerr << ss.str(); - } + std::shared_ptr exec; #elif defined(GKO_COMPILING_CUDA) - void 
print_environment() const - { - auto device_id = ResourceEnvironment::cuda_device_id; - std::stringstream ss; - ss << "Rank " << rank_ << ": CUDA device " - << gko::kernels::cuda::get_device_name(device_id) << " ID " - << device_id << std::endl; - std::cerr << ss.str(); - } - - void TearDown() override - { - gko::kernels::cuda::reset_device(ResourceEnvironment::cuda_device_id); - } + std::shared_ptr exec; #elif defined(GKO_COMPILING_HIP) - void print_environment() const - { - auto device_id = ResourceEnvironment::hip_device_id; - std::stringstream ss; - ss << "Rank " << rank_ << ": HIP device " - << gko::kernels::hip::get_device_name(device_id) << " ID " - << device_id << std::endl; - std::cerr << ss.str(); - } - - void TearDown() override - { - gko::kernels::hip::reset_device(ResourceEnvironment::hip_device_id); - } + std::shared_ptr exec; #elif defined(GKO_COMPILING_DPCPP) - void print_environment() const - { - auto device_id = ResourceEnvironment::sycl_device_id; - std::stringstream ss; - ss << "Rank " << rank_ << ": SYCL device " - << gko::kernels::dpcpp::get_device_name(device_id) << " ID " - << device_id << std::endl; - std::cerr << ss.str(); - } + std::shared_ptr exec; #else - void print_environment() const {} + std::shared_ptr exec; #endif + init_executor(ref, exec); + std::cerr << "Rank " << rank_ << ": " << exec->get_description() + << std::endl; + } private: int rank_; diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index 1b1410ca8bb..caf5269fa3d 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -19,6 +19,7 @@ #include "common/cuda_hip/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/cusparse_handle.hpp" +#include "cuda/base/device.hpp" #include "cuda/base/scoped_device_id.hpp" @@ -178,6 +179,14 @@ scoped_device_id_guard CudaExecutor::get_scoped_device_id_guard() const } +std::string CudaExecutor::get_description() const +{ + return "CudaExecutor on device " + 
std::to_string(this->get_device_id()) + + " (" + gko::kernels::cuda::get_device_name(this->get_device_id()) + + ") with host " + this->get_master()->get_description(); +} + + int CudaExecutor::get_num_devices() { int deviceCount = 0; diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index 29f0810d9d9..8a7460f6bcd 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -162,6 +162,17 @@ scoped_device_id_guard DpcppExecutor::get_scoped_device_id_guard() const } +std::string DpcppExecutor::get_description() const +{ + return "DpcppExecutor on device " + std::to_string(this->get_device_id()) + + " (" + + this->get_queue() + ->get_device() + .get_info() + + ") with host " + this->get_master()->get_description(); +} + + int DpcppExecutor::get_num_devices(std::string device_type) { return detail::get_devices(device_type).size(); diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index 9e09912c5c9..d4b1d614681 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -12,6 +12,7 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "hip/base/device.hpp" #include "hip/base/hipblas_bindings.hip.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/scoped_device_id.hip.hpp" @@ -173,6 +174,14 @@ scoped_device_id_guard HipExecutor::get_scoped_device_id_guard() const } +std::string HipExecutor::get_description() const +{ + return "HipExecutor on device " + std::to_string(this->get_device_id()) + + " (" + gko::kernels::hip::get_device_name(this->get_device_id()) + + ") with host " + this->get_master()->get_description(); +} + + int HipExecutor::get_num_devices() { int deviceCount = 0; diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 0d592485c1c..95373b3e847 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -865,6 +865,9 @@ class Executor : public 
log::EnableLogging { virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0; + /** @return a textual representation of the executor and its device. */ + virtual std::string get_description() const = 0; + protected: /** * A struct that abstracts the executor info for different executors @@ -1368,6 +1371,8 @@ class OmpExecutor : public detail::ExecutorBase, scoped_device_id_guard get_scoped_device_id_guard() const override; + std::string get_description() const override; + protected: OmpExecutor(std::shared_ptr alloc) : alloc_{std::move(alloc)} @@ -1426,6 +1431,8 @@ class ReferenceExecutor : public OmpExecutor { return {this, 0}; } + std::string get_description() const override { return "ReferenceExecutor"; } + void run(const Operation& op) const override { this->template log(this, &op); @@ -1532,6 +1539,8 @@ class CudaExecutor : public detail::ExecutorBase, scoped_device_id_guard get_scoped_device_id_guard() const override; + std::string get_description() const override; + /** * Get the CUDA device id of the device associated to this executor. */ @@ -1752,6 +1761,8 @@ class HipExecutor : public detail::ExecutorBase, scoped_device_id_guard get_scoped_device_id_guard() const override; + std::string get_description() const override; + /** * Get the HIP device id of the device associated to this executor. */ @@ -1953,6 +1964,8 @@ class DpcppExecutor : public detail::ExecutorBase, scoped_device_id_guard get_scoped_device_id_guard() const override; + std::string get_description() const override; + /** * Get the DPCPP device id of the device associated to this executor. 
* diff --git a/omp/base/executor.cpp b/omp/base/executor.cpp index 5e846946e5e..7505b78ede6 100644 --- a/omp/base/executor.cpp +++ b/omp/base/executor.cpp @@ -20,4 +20,11 @@ int OmpExecutor::get_num_omp_threads() } +std::string OmpExecutor::get_description() const +{ + return "OmpExecutor (" + std::to_string(this->get_num_omp_threads()) + + " threads)"; +} + + } // namespace gko From d1c30d271a830831fc2c9b2de2f50f189df07e01 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 19 May 2024 01:43:48 +0200 Subject: [PATCH 029/448] use Executor:::get_description() in benchmarks --- benchmark/blas/blas.cpp | 2 +- benchmark/blas/distributed/multi_vector.cpp | 6 +++--- benchmark/conversion/conversion.cpp | 2 +- benchmark/preconditioner/preconditioner.cpp | 2 +- benchmark/solver/distributed/solver.cpp | 2 +- benchmark/solver/solver.cpp | 2 +- benchmark/sparse_blas/sparse_blas.cpp | 2 +- benchmark/spmv/distributed/spmv.cpp | 6 +++--- benchmark/spmv/spmv.cpp | 3 ++- benchmark/test/reference/blas.profile.stderr | 2 +- benchmark/test/reference/blas.simple.stderr | 2 +- benchmark/test/reference/conversion.all.stderr | 2 +- benchmark/test/reference/conversion.matrix.stderr | 2 +- benchmark/test/reference/conversion.profile.stderr | 2 +- benchmark/test/reference/conversion.simple.stderr | 2 +- benchmark/test/reference/distributed_solver.matrix.stderr | 2 +- benchmark/test/reference/distributed_solver.profile.stderr | 2 +- benchmark/test/reference/distributed_solver.simple.stderr | 2 +- .../test/reference/multi_vector_distributed.profile.stderr | 2 +- .../test/reference/multi_vector_distributed.simple.stderr | 2 +- benchmark/test/reference/preconditioner.matrix.stderr | 2 +- benchmark/test/reference/preconditioner.precond.stderr | 2 +- benchmark/test/reference/preconditioner.profile.stderr | 2 +- benchmark/test/reference/preconditioner.reordered.stderr | 2 +- benchmark/test/reference/preconditioner.simple.stderr | 2 +- benchmark/test/reference/solver.matrix.stderr | 2 +- 
benchmark/test/reference/solver.profile.stderr | 2 +- benchmark/test/reference/solver.reordered.stderr | 2 +- benchmark/test/reference/solver.simple.stderr | 2 +- benchmark/test/reference/sparse_blas.matrix.stderr | 2 +- benchmark/test/reference/sparse_blas.profile.stderr | 2 +- benchmark/test/reference/sparse_blas.reordered.stderr | 2 +- benchmark/test/reference/sparse_blas.simple.stderr | 2 +- benchmark/test/reference/spmv.matrix.stderr | 2 +- benchmark/test/reference/spmv.profile.stderr | 2 +- benchmark/test/reference/spmv.reordered.stderr | 2 +- benchmark/test/reference/spmv.simple.stderr | 2 +- benchmark/test/reference/spmv_distributed.profile.stderr | 2 +- benchmark/test/reference/spmv_distributed.simple.stderr | 2 +- benchmark/utils/general.hpp | 6 +++--- 40 files changed, 47 insertions(+), 46 deletions(-) diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp index 57e0152d824..2a682f49917 100644 --- a/benchmark/blas/blas.cpp +++ b/benchmark/blas/blas.cpp @@ -104,8 +104,8 @@ Parameters for a benchmark case are: initialize_argument_parsing(&argc, &argv, header, format); std::string extra_information = "The operations are " + FLAGS_operations; - print_general_information(extra_information); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); + print_general_information(extra_information, exec); auto test_cases = json::parse(get_input_stream()); diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp index a4be6c502c1..e0dfa36fb19 100644 --- a/benchmark/blas/distributed/multi_vector.cpp +++ b/benchmark/blas/distributed/multi_vector.cpp @@ -41,14 +41,14 @@ Parameters for a benchmark case are: std::string format = Generator::get_example_config(); initialize_argument_parsing(&argc, &argv, header, format, do_print); + auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get()); + if (do_print) { std::string extra_information = "The operations are " + FLAGS_operations; - 
print_general_information(extra_information); + print_general_information(extra_information, exec); } - auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get()); - std::string json_input = broadcast_json_input(get_input_stream(), comm); auto test_cases = json::parse(json_input); diff --git a/benchmark/conversion/conversion.cpp b/benchmark/conversion/conversion.cpp index 17d2ac48e47..59c052b11e7 100644 --- a/benchmark/conversion/conversion.cpp +++ b/benchmark/conversion/conversion.cpp @@ -163,9 +163,9 @@ int main(int argc, char* argv[]) std::string extra_information = std::string() + "The formats are " + FLAGS_formats; - print_general_information(extra_information); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); + print_general_information(extra_information, exec); auto formats = split(FLAGS_formats, ','); auto test_cases = json::parse(get_input_stream()); diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp index 3c737d67d7b..b9dfe1d8369 100644 --- a/benchmark/preconditioner/preconditioner.cpp +++ b/benchmark/preconditioner/preconditioner.cpp @@ -275,9 +275,9 @@ int main(int argc, char* argv[]) std::string extra_information = "Running with preconditioners: " + FLAGS_preconditioners; - print_general_information(extra_information); auto exec = get_executor(FLAGS_gpu_timer); + print_general_information(extra_information, exec); auto& engine = get_engine(); auto preconditioners = split(FLAGS_preconditioners, ','); diff --git a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp index 196bae5331b..9605d5bbf8f 100644 --- a/benchmark/solver/distributed/solver.cpp +++ b/benchmark/solver/distributed/solver.cpp @@ -91,7 +91,7 @@ int main(int argc, char* argv[]) ss_rel_res_goal.str() + "\nThe number of right hand sides is " + std::to_string(FLAGS_nrhs); if (do_print) { - print_general_information(extra_information); + print_general_information(extra_information, exec); } 
std::set supported_solvers = {"cg", "fcg", "cgs", "bicgstab", diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp index 94956cadd21..548c843f898 100644 --- a/benchmark/solver/solver.cpp +++ b/benchmark/solver/solver.cpp @@ -43,9 +43,9 @@ int main(int argc, char* argv[]) std::to_string(FLAGS_max_iters) + " iterations and residual goal of " + ss_rel_res_goal.str() + "\nThe number of right hand sides is " + std::to_string(FLAGS_nrhs); - print_general_information(extra_information); auto exec = get_executor(FLAGS_gpu_timer); + print_general_information(extra_information, exec); json test_cases; if (!FLAGS_overhead) { diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp index 3897689ca11..cfd87b53439 100644 --- a/benchmark/sparse_blas/sparse_blas.cpp +++ b/benchmark/sparse_blas/sparse_blas.cpp @@ -166,7 +166,7 @@ int main(int argc, char* argv[]) auto test_cases = json::parse(get_input_stream()); std::string extra_information = "The operations are " + FLAGS_operations; - print_general_information(extra_information); + print_general_information(extra_information, exec); run_test_cases(SparseBlasBenchmark{}, exec, get_timer(exec, FLAGS_gpu_timer), test_cases); diff --git a/benchmark/spmv/distributed/spmv.cpp b/benchmark/spmv/distributed/spmv.cpp index 2c2e0f57b0e..135e7e4e4f0 100644 --- a/benchmark/spmv/distributed/spmv.cpp +++ b/benchmark/spmv/distributed/spmv.cpp @@ -49,16 +49,16 @@ int main(int argc, char* argv[]) initialize_argument_parsing_matrix(&argc, &argv, header, format, "", do_print); + auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get()); + if (do_print) { std::string extra_information = "The formats are [" + FLAGS_local_formats + "]x[" + FLAGS_non_local_formats + "]\n" + "The number of right hand sides is " + std::to_string(FLAGS_nrhs); - print_general_information(extra_information); + print_general_information(extra_information, exec); } - auto exec = 
executor_factory_mpi.at(FLAGS_executor)(comm.get()); - auto local_formats = split(FLAGS_local_formats, ','); auto non_local_formats = split(FLAGS_non_local_formats, ','); std::vector formats; diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp index 960921257e3..40a48232be0 100644 --- a/benchmark/spmv/spmv.cpp +++ b/benchmark/spmv/spmv.cpp @@ -26,10 +26,11 @@ int main(int argc, char* argv[]) std::string extra_information = "The formats are " + FLAGS_formats + "\nThe number of right hand sides is " + std::to_string(FLAGS_nrhs); - print_general_information(extra_information); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); + print_general_information(extra_information, exec); + auto test_cases = json::parse(get_input_stream()); run_test_cases(SpmvBenchmark{Generator{}, split(FLAGS_formats)}, diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr index 8cb03d61ab0..b4a132b8ebd 100644 --- a/benchmark/test/reference/blas.profile.stderr +++ b/benchmark/test/reference/blas.profile.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The operations are copy,axpy,scal diff --git a/benchmark/test/reference/blas.simple.stderr b/benchmark/test/reference/blas.simple.stderr index 3ed5cf3784e..ff505a3f1c9 100644 --- a/benchmark/test/reference/blas.simple.stderr +++ b/benchmark/test/reference/blas.simple.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The operations are copy,axpy,scal diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr index d1759e1021e..ed52cf42fb4 100644 --- a/benchmark/test/reference/conversion.all.stderr +++ b/benchmark/test/reference/conversion.all.stderr @@ -1,4 +1,4 @@ 
-Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo,csr,ell,sellp,hybrid diff --git a/benchmark/test/reference/conversion.matrix.stderr b/benchmark/test/reference/conversion.matrix.stderr index 2f9d717e268..2ad5c0a1545 100644 --- a/benchmark/test/reference/conversion.matrix.stderr +++ b/benchmark/test/reference/conversion.matrix.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo,csr diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr index 7b543f87314..561680cc885 100644 --- a/benchmark/test/reference/conversion.profile.stderr +++ b/benchmark/test/reference/conversion.profile.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The formats are coo,csr diff --git a/benchmark/test/reference/conversion.simple.stderr b/benchmark/test/reference/conversion.simple.stderr index 76c52df1d56..23a27c4372a 100644 --- a/benchmark/test/reference/conversion.simple.stderr +++ b/benchmark/test/reference/conversion.simple.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo,csr diff --git a/benchmark/test/reference/distributed_solver.matrix.stderr b/benchmark/test/reference/distributed_solver.matrix.stderr index 8fa38bfb7ed..dddd27e145a 100644 --- a/benchmark/test/reference/distributed_solver.matrix.stderr +++ b/benchmark/test/reference/distributed_solver.matrix.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 1 running 
iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr index 907ff8a9c98..cf5006ab785 100644 --- a/benchmark/test/reference/distributed_solver.profile.stderr +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 diff --git a/benchmark/test/reference/distributed_solver.simple.stderr b/benchmark/test/reference/distributed_solver.simple.stderr index 952be67e93c..9d4b1f7094e 100644 --- a/benchmark/test/reference/distributed_solver.simple.stderr +++ b/benchmark/test/reference/distributed_solver.simple.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr index 39fc91b3fed..10de82cae01 100644 --- a/benchmark/test/reference/multi_vector_distributed.profile.stderr +++ b/benchmark/test/reference/multi_vector_distributed.profile.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The operations are copy,axpy,scal diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stderr b/benchmark/test/reference/multi_vector_distributed.simple.stderr index 3ed5cf3784e..ff505a3f1c9 100644 --- a/benchmark/test/reference/multi_vector_distributed.simple.stderr +++ 
b/benchmark/test/reference/multi_vector_distributed.simple.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The operations are copy,axpy,scal diff --git a/benchmark/test/reference/preconditioner.matrix.stderr b/benchmark/test/reference/preconditioner.matrix.stderr index 12af18b503e..ad79bbffc34 100644 --- a/benchmark/test/reference/preconditioner.matrix.stderr +++ b/benchmark/test/reference/preconditioner.matrix.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 Running with preconditioners: none diff --git a/benchmark/test/reference/preconditioner.precond.stderr b/benchmark/test/reference/preconditioner.precond.stderr index 52c54ffdd65..49bb9820f76 100644 --- a/benchmark/test/reference/preconditioner.precond.stderr +++ b/benchmark/test/reference/preconditioner.precond.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 Running with preconditioners: jacobi diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr index 56cfcc39c89..34cf27acbc6 100644 --- a/benchmark/test/reference/preconditioner.profile.stderr +++ b/benchmark/test/reference/preconditioner.profile.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running with preconditioners: none diff --git a/benchmark/test/reference/preconditioner.reordered.stderr b/benchmark/test/reference/preconditioner.reordered.stderr index e26d2a7b0dd..d36bc663e57 100644 --- a/benchmark/test/reference/preconditioner.reordered.stderr +++ 
b/benchmark/test/reference/preconditioner.reordered.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 Running with preconditioners: none diff --git a/benchmark/test/reference/preconditioner.simple.stderr b/benchmark/test/reference/preconditioner.simple.stderr index e26d2a7b0dd..d36bc663e57 100644 --- a/benchmark/test/reference/preconditioner.simple.stderr +++ b/benchmark/test/reference/preconditioner.simple.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 Running with preconditioners: none diff --git a/benchmark/test/reference/solver.matrix.stderr b/benchmark/test/reference/solver.matrix.stderr index 8fa38bfb7ed..dddd27e145a 100644 --- a/benchmark/test/reference/solver.matrix.stderr +++ b/benchmark/test/reference/solver.matrix.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr index 70bfe336298..f70cf743888 100644 --- a/benchmark/test/reference/solver.profile.stderr +++ b/benchmark/test/reference/solver.profile.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 diff --git a/benchmark/test/reference/solver.reordered.stderr b/benchmark/test/reference/solver.reordered.stderr index fa61d6c4050..6baa84ee792 100644 --- a/benchmark/test/reference/solver.reordered.stderr +++ b/benchmark/test/reference/solver.reordered.stderr @@ -1,4 +1,4 
@@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 diff --git a/benchmark/test/reference/solver.simple.stderr b/benchmark/test/reference/solver.simple.stderr index fa61d6c4050..6baa84ee792 100644 --- a/benchmark/test/reference/solver.simple.stderr +++ b/benchmark/test/reference/solver.simple.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 1 running iterations The random seed for right hand sides is 42 Running cg with 1000 iterations and residual goal of 1.000000e-06 diff --git a/benchmark/test/reference/sparse_blas.matrix.stderr b/benchmark/test/reference/sparse_blas.matrix.stderr index 1702804ddb9..e8f92862042 100644 --- a/benchmark/test/reference/sparse_blas.matrix.stderr +++ b/benchmark/test/reference/sparse_blas.matrix.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The operations are transpose diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr index 23af617c55b..cfb7998cfb6 100644 --- a/benchmark/test/reference/sparse_blas.profile.stderr +++ b/benchmark/test/reference/sparse_blas.profile.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The operations are transpose diff --git a/benchmark/test/reference/sparse_blas.reordered.stderr b/benchmark/test/reference/sparse_blas.reordered.stderr index c7259f7e4ea..874e6c4a7f1 100644 --- a/benchmark/test/reference/sparse_blas.reordered.stderr +++ b/benchmark/test/reference/sparse_blas.reordered.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on 
ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The operations are symbolic_cholesky diff --git a/benchmark/test/reference/sparse_blas.simple.stderr b/benchmark/test/reference/sparse_blas.simple.stderr index 0e8b2dfeb66..d4e29cd9cd7 100644 --- a/benchmark/test/reference/sparse_blas.simple.stderr +++ b/benchmark/test/reference/sparse_blas.simple.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The operations are transpose diff --git a/benchmark/test/reference/spmv.matrix.stderr b/benchmark/test/reference/spmv.matrix.stderr index b3d2b47dffb..7896bb14728 100644 --- a/benchmark/test/reference/spmv.matrix.stderr +++ b/benchmark/test/reference/spmv.matrix.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr index 16d23370d16..4861b217d44 100644 --- a/benchmark/test/reference/spmv.profile.stderr +++ b/benchmark/test/reference/spmv.profile.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The formats are coo diff --git a/benchmark/test/reference/spmv.reordered.stderr b/benchmark/test/reference/spmv.reordered.stderr index 555459ca70c..a1f6a62e866 100644 --- a/benchmark/test/reference/spmv.reordered.stderr +++ b/benchmark/test/reference/spmv.reordered.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo diff --git a/benchmark/test/reference/spmv.simple.stderr 
b/benchmark/test/reference/spmv.simple.stderr index 555459ca70c..a1f6a62e866 100644 --- a/benchmark/test/reference/spmv.simple.stderr +++ b/benchmark/test/reference/spmv.simple.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are coo diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr index d3a645aa0f7..a671e0d660f 100644 --- a/benchmark/test/reference/spmv_distributed.profile.stderr +++ b/benchmark/test/reference/spmv_distributed.profile.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 0 warm iterations and 1 running iterations The random seed for right hand sides is 42 The formats are [csr]x[csr] diff --git a/benchmark/test/reference/spmv_distributed.simple.stderr b/benchmark/test/reference/spmv_distributed.simple.stderr index fc3409bded7..b3739ed8774 100644 --- a/benchmark/test/reference/spmv_distributed.simple.stderr +++ b/benchmark/test/reference/spmv_distributed.simple.stderr @@ -1,4 +1,4 @@ -Running on reference(0) +Running on ReferenceExecutor Running with 2 warm iterations and 10 running iterations The random seed for right hand sides is 42 The formats are [csr]x[csr] diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 5ae34fa00ab..58b5410478d 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -168,11 +168,11 @@ void initialize_argument_parsing(int* argc, char** argv[], std::string& header, * * @param extra describes benchmark specific extra parameters to output */ -void print_general_information(const std::string& extra) +void print_general_information(const std::string& extra, + std::shared_ptr exec) { std::clog << gko::version_info::get() << std::endl - << "Running on " << FLAGS_executor << "(" << FLAGS_device_id - << ")\n" + << "Running on " << 
exec->get_description() << std::endl << "Running with " << FLAGS_warmup << " warm iterations and "; if (FLAGS_repetitions == "auto") { std::clog << "adaptively determined repetititions with " From 818a31258e583eb7ec815dc818241662e9b82eff Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 19 May 2024 01:44:42 +0200 Subject: [PATCH 030/448] use Executor:::get_description() in tests --- test/base/batch_multi_vector_kernels.cpp | 2 +- test/base/device_matrix_data_kernels.cpp | 2 +- test/base/executor.cpp | 1 + test/base/index_range.cpp | 2 +- test/base/kernel_launch_generic.cpp | 2 +- test/base/timer.cpp | 2 +- test/components/absolute_array_kernels.cpp | 2 +- test/components/fill_array_kernels.cpp | 2 +- test/components/format_conversion_kernels.cpp | 2 +- .../precision_conversion_kernels.cpp | 2 +- test/components/prefix_sum_kernels.cpp | 2 +- test/components/reduce_array_kernels.cpp | 2 +- test/distributed/matrix_kernels.cpp | 2 +- test/distributed/partition_helper_kernels.cpp | 2 +- test/distributed/partition_kernels.cpp | 2 +- test/distributed/vector_kernels.cpp | 2 +- test/factorization/cholesky_kernels.cpp | 2 +- test/factorization/ic_kernels.cpp | 2 +- test/factorization/ilu_kernels.cpp | 2 +- test/factorization/lu_kernels.cpp | 2 +- test/factorization/par_ic_kernels.cpp | 2 +- test/factorization/par_ict_kernels.cpp | 2 +- test/factorization/par_ilu_kernels.cpp | 2 +- test/factorization/par_ilut_kernels.cpp | 2 +- test/log/profiler_hook.cpp | 2 +- test/matrix/batch_csr_kernels.cpp | 2 +- test/matrix/batch_dense_kernels.cpp | 2 +- test/matrix/batch_ell_kernels.cpp | 2 +- test/matrix/coo_kernels.cpp | 2 +- test/matrix/csr_kernels.cpp | 2 +- test/matrix/csr_kernels2.cpp | 2 +- test/matrix/dense_kernels.cpp | 2 +- test/matrix/diagonal_kernels.cpp | 2 +- test/matrix/ell_kernels.cpp | 2 +- test/matrix/fbcsr_kernels.cpp | 2 +- test/matrix/fft_kernels.cpp | 2 +- test/matrix/hybrid_kernels.cpp | 2 +- test/matrix/matrix.cpp | 2 +- 
test/matrix/permutation_kernels.cpp | 2 +- test/matrix/scaled_permutation_kernels.cpp | 2 +- test/matrix/sellp_kernels.cpp | 2 +- test/matrix/sparsity_csr_kernels.cpp | 2 +- test/mpi/matrix.cpp | 2 +- test/mpi/multigrid/pgm.cpp | 2 +- test/mpi/partition_helpers.cpp | 2 +- test/mpi/preconditioner/schwarz.cpp | 2 +- test/mpi/solver/solver.cpp | 2 +- test/mpi/vector.cpp | 2 +- test/multigrid/fixed_coarsening_kernels.cpp | 2 +- test/multigrid/pgm_kernels.cpp | 2 +- test/preconditioner/batch_jacobi_kernels.cpp | 2 +- test/preconditioner/isai_kernels.cpp | 2 +- test/preconditioner/jacobi_kernels.cpp | 2 +- test/reorder/amd.cpp | 2 +- test/reorder/mc64.cpp | 2 +- test/reorder/nested_dissection.cpp | 2 +- test/reorder/rcm.cpp | 2 +- test/solver/batch_bicgstab_kernels.cpp | 2 +- test/solver/batch_cg_kernels.cpp | 2 +- test/solver/bicg_kernels.cpp | 2 +- test/solver/bicgstab_kernels.cpp | 2 +- test/solver/cb_gmres_kernels.cpp | 2 +- test/solver/cg_kernels.cpp | 2 +- test/solver/cgs_kernels.cpp | 2 +- test/solver/direct.cpp | 2 +- test/solver/fcg_kernels.cpp | 2 +- test/solver/gcr_kernels.cpp | 2 +- test/solver/gmres_kernels.cpp | 2 +- test/solver/idr_kernels.cpp | 2 +- test/solver/ir_kernels.cpp | 2 +- test/solver/lower_trs_kernels.cpp | 2 +- test/solver/multigrid_kernels.cpp | 2 +- test/solver/solver.cpp | 2 +- test/solver/upper_trs_kernels.cpp | 2 +- test/stop/combined_kernels.cpp | 2 +- test/stop/criterion_kernels.cpp | 2 +- test/stop/residual_norm_kernels.cpp | 2 +- test/utils/common_fixture.hpp | 82 +++++++++++++++++++ test/utils/executor.hpp | 58 ------------- .../mpi/{executor.hpp => common_fixture.hpp} | 6 +- 80 files changed, 162 insertions(+), 137 deletions(-) create mode 100644 test/utils/common_fixture.hpp rename test/utils/mpi/{executor.hpp => common_fixture.hpp} (90%) diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index d15e6d2165f..6ce391c92cb 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ 
b/test/base/batch_multi_vector_kernels.cpp @@ -16,7 +16,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" #include "core/test/utils/batch_helpers.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class MultiVector : public CommonTestFixture { diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp index 59c9ec209c3..ffadbcfb245 100644 --- a/test/base/device_matrix_data_kernels.cpp +++ b/test/base/device_matrix_data_kernels.cpp @@ -16,7 +16,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/base/executor.cpp b/test/base/executor.cpp index 3b93d7e748a..8a344eb224d 100644 --- a/test/base/executor.cpp +++ b/test/base/executor.cpp @@ -11,6 +11,7 @@ #include #include "core/test/utils/assertions.hpp" +#include "test/utils/common_fixture.hpp" namespace reference { diff --git a/test/base/index_range.cpp b/test/base/index_range.cpp index 8bb5519c457..0a344a63d9a 100644 --- a/test/base/index_range.cpp +++ b/test/base/index_range.cpp @@ -12,7 +12,7 @@ #include "common/unified/base/kernel_launch.hpp" #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class IndexRange : public CommonTestFixture { diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index 8107e6a3eef..0d187b07bdf 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -18,7 +18,7 @@ #include "common/unified/base/kernel_launch_solver.hpp" #include "core/base/array_access.hpp" #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" using gko::dim; diff --git a/test/base/timer.cpp b/test/base/timer.cpp index f2f0da113bf..2463f508450 100644 --- a/test/base/timer.cpp +++ b/test/base/timer.cpp @@ -10,7 +10,7 
@@ #include #include "core/test/utils/assertions.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Timer : public CommonTestFixture { diff --git a/test/components/absolute_array_kernels.cpp b/test/components/absolute_array_kernels.cpp index a18ab1534c9..3a4a2d787aa 100644 --- a/test/components/absolute_array_kernels.cpp +++ b/test/components/absolute_array_kernels.cpp @@ -13,7 +13,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class AbsoluteArray : public CommonTestFixture { diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp index 122edb4dc27..4756180f896 100644 --- a/test/components/fill_array_kernels.cpp +++ b/test/components/fill_array_kernels.cpp @@ -13,7 +13,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/components/format_conversion_kernels.cpp b/test/components/format_conversion_kernels.cpp index 3e783206af5..217ecd22600 100644 --- a/test/components/format_conversion_kernels.cpp +++ b/test/components/format_conversion_kernels.cpp @@ -11,7 +11,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/components/precision_conversion_kernels.cpp b/test/components/precision_conversion_kernels.cpp index dcd6a0dba83..9eb26f0a9b8 100644 --- a/test/components/precision_conversion_kernels.cpp +++ b/test/components/precision_conversion_kernels.cpp @@ -13,7 +13,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" #if !(GINKGO_COMMON_SINGLE_MODE) diff --git a/test/components/prefix_sum_kernels.cpp b/test/components/prefix_sum_kernels.cpp index 1ec97b6eadc..4a1c950855a 100644 --- a/test/components/prefix_sum_kernels.cpp +++ 
b/test/components/prefix_sum_kernels.cpp @@ -15,7 +15,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp index 35c358099ad..182928412f2 100644 --- a/test/components/reduce_array_kernels.cpp +++ b/test/components/reduce_array_kernels.cpp @@ -13,7 +13,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp index 3dcede95bfb..ad91d699496 100644 --- a/test/distributed/matrix_kernels.cpp +++ b/test/distributed/matrix_kernels.cpp @@ -13,7 +13,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" using comm_index_type = gko::experimental::distributed::comm_index_type; diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index 5b014625e7d..2f1c8a2002d 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -10,7 +10,7 @@ #include "core/base/iterator_factory.hpp" #include "core/distributed/partition_helpers_kernels.hpp" #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" using gko::experimental::distributed::comm_index_type; diff --git a/test/distributed/partition_kernels.cpp b/test/distributed/partition_kernels.cpp index b00d266170c..6634744211d 100644 --- a/test/distributed/partition_kernels.cpp +++ b/test/distributed/partition_kernels.cpp @@ -15,7 +15,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" using comm_index_type = gko::experimental::distributed::comm_index_type; diff --git a/test/distributed/vector_kernels.cpp 
b/test/distributed/vector_kernels.cpp index 294b72d861e..1246da9a116 100644 --- a/test/distributed/vector_kernels.cpp +++ b/test/distributed/vector_kernels.cpp @@ -15,7 +15,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" using comm_index_type = gko::experimental::distributed::comm_index_type; diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index b7c290eec17..94d31fe33db 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -24,7 +24,7 @@ #include "core/test/utils/assertions.hpp" #include "core/utils/matrix_utils.hpp" #include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" namespace { diff --git a/test/factorization/ic_kernels.cpp b/test/factorization/ic_kernels.cpp index ddb38575e03..9f0b60443f2 100644 --- a/test/factorization/ic_kernels.cpp +++ b/test/factorization/ic_kernels.cpp @@ -16,7 +16,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Ic : public CommonTestFixture { diff --git a/test/factorization/ilu_kernels.cpp b/test/factorization/ilu_kernels.cpp index bc7edeac57f..004b0d34a4f 100644 --- a/test/factorization/ilu_kernels.cpp +++ b/test/factorization/ilu_kernels.cpp @@ -16,7 +16,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Ilu : public CommonTestFixture { diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp index 035e938c7c8..830ba6ddd5f 100644 --- a/test/factorization/lu_kernels.cpp +++ b/test/factorization/lu_kernels.cpp @@ -27,7 +27,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" 
#include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp index 64541612343..de2342a28db 100644 --- a/test/factorization/par_ic_kernels.cpp +++ b/test/factorization/par_ic_kernels.cpp @@ -23,7 +23,7 @@ #include "core/matrix/csr_kernels.hpp" #include "core/test/utils.hpp" #include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp index b157971ff90..3b33e52630c 100644 --- a/test/factorization/par_ict_kernels.cpp +++ b/test/factorization/par_ict_kernels.cpp @@ -23,7 +23,7 @@ #include "core/matrix/csr_kernels.hpp" #include "core/test/utils.hpp" #include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp index a2f3f774ba7..88f5ecff0d9 100644 --- a/test/factorization/par_ilu_kernels.cpp +++ b/test/factorization/par_ilu_kernels.cpp @@ -22,7 +22,7 @@ #include "core/factorization/factorization_kernels.hpp" #include "core/test/utils.hpp" #include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp index 6426e725fdf..dff3cc702c1 100644 --- a/test/factorization/par_ilut_kernels.cpp +++ b/test/factorization/par_ilut_kernels.cpp @@ -23,7 +23,7 @@ #include "core/matrix/csr_kernels.hpp" #include "core/test/utils.hpp" #include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/log/profiler_hook.cpp b/test/log/profiler_hook.cpp index 6e0ed2933db..414477b996a 100644 --- a/test/log/profiler_hook.cpp +++ 
b/test/log/profiler_hook.cpp @@ -8,7 +8,7 @@ #include -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class ProfilerHook : public CommonTestFixture { diff --git a/test/matrix/batch_csr_kernels.cpp b/test/matrix/batch_csr_kernels.cpp index d2a1b2d9aa4..d466885d056 100644 --- a/test/matrix/batch_csr_kernels.cpp +++ b/test/matrix/batch_csr_kernels.cpp @@ -19,7 +19,7 @@ #include "core/test/utils/array_generator.hpp" #include "core/test/utils/assertions.hpp" #include "core/test/utils/batch_helpers.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Csr : public CommonTestFixture { diff --git a/test/matrix/batch_dense_kernels.cpp b/test/matrix/batch_dense_kernels.cpp index 222ccf6e4b9..17f27b8afa8 100644 --- a/test/matrix/batch_dense_kernels.cpp +++ b/test/matrix/batch_dense_kernels.cpp @@ -19,7 +19,7 @@ #include "core/test/utils/array_generator.hpp" #include "core/test/utils/assertions.hpp" #include "core/test/utils/batch_helpers.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Dense : public CommonTestFixture { diff --git a/test/matrix/batch_ell_kernels.cpp b/test/matrix/batch_ell_kernels.cpp index 7edef2c4fb0..77c4dae13b0 100644 --- a/test/matrix/batch_ell_kernels.cpp +++ b/test/matrix/batch_ell_kernels.cpp @@ -18,7 +18,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" #include "core/test/utils/batch_helpers.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Ell : public CommonTestFixture { diff --git a/test/matrix/coo_kernels.cpp b/test/matrix/coo_kernels.cpp index 3da488cf843..091f95544e6 100644 --- a/test/matrix/coo_kernels.cpp +++ b/test/matrix/coo_kernels.cpp @@ -18,7 +18,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Coo : public CommonTestFixture { diff --git 
a/test/matrix/csr_kernels.cpp b/test/matrix/csr_kernels.cpp index 1a1f100e1fd..ec726d856c8 100644 --- a/test/matrix/csr_kernels.cpp +++ b/test/matrix/csr_kernels.cpp @@ -18,7 +18,7 @@ #include "core/base/array_access.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Csr : public CommonTestFixture { diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp index 9272e99546e..9b3f09a13fc 100644 --- a/test/matrix/csr_kernels2.cpp +++ b/test/matrix/csr_kernels2.cpp @@ -27,7 +27,7 @@ #include "core/test/utils/assertions.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "core/utils/matrix_utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Csr : public CommonTestFixture { diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp index b8fd4d7900c..76e6487aa89 100644 --- a/test/matrix/dense_kernels.cpp +++ b/test/matrix/dense_kernels.cpp @@ -26,7 +26,7 @@ #include "core/components/fill_array_kernels.hpp" #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Dense : public CommonTestFixture { diff --git a/test/matrix/diagonal_kernels.cpp b/test/matrix/diagonal_kernels.cpp index ca0a9eff205..3d2f505a19f 100644 --- a/test/matrix/diagonal_kernels.cpp +++ b/test/matrix/diagonal_kernels.cpp @@ -15,7 +15,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Diagonal : public CommonTestFixture { diff --git a/test/matrix/ell_kernels.cpp b/test/matrix/ell_kernels.cpp index 78af81ccafc..9900caa10c8 100644 --- a/test/matrix/ell_kernels.cpp +++ b/test/matrix/ell_kernels.cpp @@ -17,7 +17,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Ell : public CommonTestFixture { diff 
--git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp index a3b85143bf0..8cff04c28a0 100644 --- a/test/matrix/fbcsr_kernels.cpp +++ b/test/matrix/fbcsr_kernels.cpp @@ -14,7 +14,7 @@ #include "core/test/matrix/fbcsr_sample.hpp" #include "core/test/utils.hpp" #include "core/test/utils/fb_matrix_generator.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/matrix/fft_kernels.cpp b/test/matrix/fft_kernels.cpp index 056087fb9f3..5b2c33085e3 100644 --- a/test/matrix/fft_kernels.cpp +++ b/test/matrix/fft_kernels.cpp @@ -13,7 +13,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/matrix/hybrid_kernels.cpp b/test/matrix/hybrid_kernels.cpp index 64179259deb..7028a14bd96 100644 --- a/test/matrix/hybrid_kernels.cpp +++ b/test/matrix/hybrid_kernels.cpp @@ -15,7 +15,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Hybrid : public CommonTestFixture { diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index 7398b3edb06..eea1a67ef5f 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -22,7 +22,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" #if GINKGO_COMMON_SINGLE_MODE diff --git a/test/matrix/permutation_kernels.cpp b/test/matrix/permutation_kernels.cpp index e6324c15f1d..3e2a97c02bd 100644 --- a/test/matrix/permutation_kernels.cpp +++ b/test/matrix/permutation_kernels.cpp @@ -11,7 +11,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Permutation : public CommonTestFixture { diff --git a/test/matrix/scaled_permutation_kernels.cpp b/test/matrix/scaled_permutation_kernels.cpp index 7239862a8d9..545c7fd064d 100644 --- 
a/test/matrix/scaled_permutation_kernels.cpp +++ b/test/matrix/scaled_permutation_kernels.cpp @@ -10,7 +10,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class ScaledPermutation : public CommonTestFixture { diff --git a/test/matrix/sellp_kernels.cpp b/test/matrix/sellp_kernels.cpp index 053369f7fa6..549277b40a3 100644 --- a/test/matrix/sellp_kernels.cpp +++ b/test/matrix/sellp_kernels.cpp @@ -16,7 +16,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Sellp : public CommonTestFixture { diff --git a/test/matrix/sparsity_csr_kernels.cpp b/test/matrix/sparsity_csr_kernels.cpp index 8d3728f240d..75af81874e3 100644 --- a/test/matrix/sparsity_csr_kernels.cpp +++ b/test/matrix/sparsity_csr_kernels.cpp @@ -19,7 +19,7 @@ #include "core/test/utils/assertions.hpp" #include "core/test/utils/matrix_generator.hpp" #include "core/test/utils/unsort_matrix.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" namespace { diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 8a201c78733..cc9ec219a88 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -20,7 +20,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/mpi/executor.hpp" +#include "test/utils/mpi/common_fixture.hpp" #ifndef GKO_COMPILING_DPCPP diff --git a/test/mpi/multigrid/pgm.cpp b/test/mpi/multigrid/pgm.cpp index ccd7dd46b44..664ad0cd4ec 100644 --- a/test/mpi/multigrid/pgm.cpp +++ b/test/mpi/multigrid/pgm.cpp @@ -19,7 +19,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/mpi/executor.hpp" +#include "test/utils/mpi/common_fixture.hpp" #if GINKGO_DPCPP_SINGLE_MODE diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp index b89295acc13..43b4783d896 100644 --- a/test/mpi/partition_helpers.cpp +++ b/test/mpi/partition_helpers.cpp @@ -6,7 +6,7 @@ #include #include 
"core/test/utils.hpp" -#include "test/utils/mpi/executor.hpp" +#include "test/utils/mpi/common_fixture.hpp" using comm_index_type = gko::experimental::distributed::comm_index_type; diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index cf29ea77016..6717cd9d888 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ b/test/mpi/preconditioner/schwarz.cpp @@ -29,7 +29,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/matrix_generator.hpp" #include "core/utils/matrix_utils.hpp" -#include "test/utils/mpi/executor.hpp" +#include "test/utils/mpi/common_fixture.hpp" #if GINKGO_DPCPP_SINGLE_MODE diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp index 4548dc5d6b7..589be91bcba 100644 --- a/test/mpi/solver/solver.cpp +++ b/test/mpi/solver/solver.cpp @@ -31,7 +31,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/matrix_generator.hpp" #include "core/utils/matrix_utils.hpp" -#include "test/utils/mpi/executor.hpp" +#include "test/utils/mpi/common_fixture.hpp" #if GINKGO_DPCPP_SINGLE_MODE diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index 3af6886dd84..cedd483b0a2 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -17,7 +17,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/mpi/executor.hpp" +#include "test/utils/mpi/common_fixture.hpp" bool needs_transfers(std::shared_ptr exec) diff --git a/test/multigrid/fixed_coarsening_kernels.cpp b/test/multigrid/fixed_coarsening_kernels.cpp index 0f3c7e56b2a..91c1e021a76 100644 --- a/test/multigrid/fixed_coarsening_kernels.cpp +++ b/test/multigrid/fixed_coarsening_kernels.cpp @@ -26,7 +26,7 @@ #include "core/test/utils/matrix_generator.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "core/utils/matrix_utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class FixedCoarsening : public CommonTestFixture { diff --git a/test/multigrid/pgm_kernels.cpp b/test/multigrid/pgm_kernels.cpp index 
b0e3b338cbd..cdbfb5295f2 100644 --- a/test/multigrid/pgm_kernels.cpp +++ b/test/multigrid/pgm_kernels.cpp @@ -25,7 +25,7 @@ #include "core/test/utils/matrix_generator.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "core/utils/matrix_utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Pgm : public CommonTestFixture { diff --git a/test/preconditioner/batch_jacobi_kernels.cpp b/test/preconditioner/batch_jacobi_kernels.cpp index 9bdbb015949..62e309361c9 100644 --- a/test/preconditioner/batch_jacobi_kernels.cpp +++ b/test/preconditioner/batch_jacobi_kernels.cpp @@ -22,7 +22,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" #include "core/test/utils/batch_helpers.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" namespace detail { diff --git a/test/preconditioner/isai_kernels.cpp b/test/preconditioner/isai_kernels.cpp index 077379ab226..8ac1ad1e8ba 100644 --- a/test/preconditioner/isai_kernels.cpp +++ b/test/preconditioner/isai_kernels.cpp @@ -18,7 +18,7 @@ #include "core/test/utils.hpp" #include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" enum struct matrix_type { lower, upper, general, spd }; diff --git a/test/preconditioner/jacobi_kernels.cpp b/test/preconditioner/jacobi_kernels.cpp index 5ae7c56e715..23347d8d896 100644 --- a/test/preconditioner/jacobi_kernels.cpp +++ b/test/preconditioner/jacobi_kernels.cpp @@ -13,7 +13,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "core/utils/matrix_utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Jacobi : public CommonTestFixture { diff --git a/test/reorder/amd.cpp b/test/reorder/amd.cpp index 8137ed8ad7e..a1ca7c09359 100644 --- a/test/reorder/amd.cpp +++ b/test/reorder/amd.cpp @@ -15,7 +15,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" 
#include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/reorder/mc64.cpp b/test/reorder/mc64.cpp index 0cc3ea33a3d..f05b13d19c0 100644 --- a/test/reorder/mc64.cpp +++ b/test/reorder/mc64.cpp @@ -8,7 +8,7 @@ #include #include "core/test/utils/assertions.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" namespace { diff --git a/test/reorder/nested_dissection.cpp b/test/reorder/nested_dissection.cpp index 2d11bdccb12..d35818f28e6 100644 --- a/test/reorder/nested_dissection.cpp +++ b/test/reorder/nested_dissection.cpp @@ -11,7 +11,7 @@ #include "core/test/utils.hpp" #include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/reorder/rcm.cpp b/test/reorder/rcm.cpp index 848d0deea5d..9ae656fbc1c 100644 --- a/test/reorder/rcm.cpp +++ b/test/reorder/rcm.cpp @@ -19,7 +19,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" #include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Rcm : public CommonTestFixture { diff --git a/test/solver/batch_bicgstab_kernels.cpp b/test/solver/batch_bicgstab_kernels.cpp index 8f4bfca00cc..1a852eacfe9 100644 --- a/test/solver/batch_bicgstab_kernels.cpp +++ b/test/solver/batch_bicgstab_kernels.cpp @@ -20,7 +20,7 @@ #include "core/matrix/batch_dense_kernels.hpp" #include "core/test/utils.hpp" #include "core/test/utils/batch_helpers.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class BatchBicgstab : public CommonTestFixture { diff --git a/test/solver/batch_cg_kernels.cpp b/test/solver/batch_cg_kernels.cpp index 7b5a85a1e5b..4c6de9004c9 100644 --- a/test/solver/batch_cg_kernels.cpp +++ b/test/solver/batch_cg_kernels.cpp @@ -19,7 +19,7 @@ #include "core/matrix/batch_dense_kernels.hpp" #include "core/test/utils.hpp" #include 
"core/test/utils/batch_helpers.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class BatchCg : public CommonTestFixture { diff --git a/test/solver/bicg_kernels.cpp b/test/solver/bicg_kernels.cpp index 5f9dd818711..3f3b6a01ae1 100644 --- a/test/solver/bicg_kernels.cpp +++ b/test/solver/bicg_kernels.cpp @@ -19,7 +19,7 @@ #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" #include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Bicg : public CommonTestFixture { diff --git a/test/solver/bicgstab_kernels.cpp b/test/solver/bicgstab_kernels.cpp index 9548c99daf9..a90451a3f3a 100644 --- a/test/solver/bicgstab_kernels.cpp +++ b/test/solver/bicgstab_kernels.cpp @@ -19,7 +19,7 @@ #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Bicgstab : public CommonTestFixture { diff --git a/test/solver/cb_gmres_kernels.cpp b/test/solver/cb_gmres_kernels.cpp index 45a752a2292..022899d21e6 100644 --- a/test/solver/cb_gmres_kernels.cpp +++ b/test/solver/cb_gmres_kernels.cpp @@ -20,7 +20,7 @@ #include "core/solver/cb_gmres_accessor.hpp" #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class CbGmres : public CommonTestFixture { diff --git a/test/solver/cg_kernels.cpp b/test/solver/cg_kernels.cpp index b4408851da6..13e6905fa81 100644 --- a/test/solver/cg_kernels.cpp +++ b/test/solver/cg_kernels.cpp @@ -18,7 +18,7 @@ #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Cg : public CommonTestFixture { diff --git a/test/solver/cgs_kernels.cpp b/test/solver/cgs_kernels.cpp index 392167d2106..f952e68170e 100644 --- a/test/solver/cgs_kernels.cpp +++ b/test/solver/cgs_kernels.cpp @@ -18,7 +18,7 @@ #include "core/test/utils.hpp" 
#include "core/utils/matrix_utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Cgs : public CommonTestFixture { diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp index c2e6c757f76..a58d3d46f3f 100644 --- a/test/solver/direct.cpp +++ b/test/solver/direct.cpp @@ -23,7 +23,7 @@ #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" #include "matrices/config.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" namespace { diff --git a/test/solver/fcg_kernels.cpp b/test/solver/fcg_kernels.cpp index 9ad2be9eb05..194151f203e 100644 --- a/test/solver/fcg_kernels.cpp +++ b/test/solver/fcg_kernels.cpp @@ -18,7 +18,7 @@ #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Fcg : public CommonTestFixture { diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp index d26b5ef265c..5a46bbbb940 100644 --- a/test/solver/gcr_kernels.cpp +++ b/test/solver/gcr_kernels.cpp @@ -19,7 +19,7 @@ #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Gcr : public CommonTestFixture { diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp index 52ee885e29d..a6c74bd45c0 100644 --- a/test/solver/gmres_kernels.cpp +++ b/test/solver/gmres_kernels.cpp @@ -19,7 +19,7 @@ #include "core/solver/common_gmres_kernels.hpp" #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Gmres : public CommonTestFixture { diff --git a/test/solver/idr_kernels.cpp b/test/solver/idr_kernels.cpp index 7afac1c2f33..a9857952615 100644 --- a/test/solver/idr_kernels.cpp +++ b/test/solver/idr_kernels.cpp @@ -27,7 +27,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include 
"test/utils/common_fixture.hpp" // use another alias to avoid conflict name in the Idr diff --git a/test/solver/ir_kernels.cpp b/test/solver/ir_kernels.cpp index 114dee3c06b..31973e849b1 100644 --- a/test/solver/ir_kernels.cpp +++ b/test/solver/ir_kernels.cpp @@ -17,7 +17,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Ir : public CommonTestFixture { diff --git a/test/solver/lower_trs_kernels.cpp b/test/solver/lower_trs_kernels.cpp index 4bccf283faf..b838c1df14b 100644 --- a/test/solver/lower_trs_kernels.cpp +++ b/test/solver/lower_trs_kernels.cpp @@ -15,7 +15,7 @@ #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class LowerTrs : public CommonTestFixture { diff --git a/test/solver/multigrid_kernels.cpp b/test/solver/multigrid_kernels.cpp index 894f4280346..2efb7cf8158 100644 --- a/test/solver/multigrid_kernels.cpp +++ b/test/solver/multigrid_kernels.cpp @@ -16,7 +16,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class Multigrid : public CommonTestFixture { diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp index 5b24234ce14..47414f83041 100644 --- a/test/solver/solver.cpp +++ b/test/solver/solver.cpp @@ -31,7 +31,7 @@ #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" #if GINKGO_COMMON_SINGLE_MODE diff --git a/test/solver/upper_trs_kernels.cpp b/test/solver/upper_trs_kernels.cpp index c7041865dd1..6825d9f6c3b 100644 --- a/test/solver/upper_trs_kernels.cpp +++ b/test/solver/upper_trs_kernels.cpp @@ -15,7 +15,7 @@ #include "core/test/utils.hpp" #include "core/utils/matrix_utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class UpperTrs : public CommonTestFixture { diff 
--git a/test/stop/combined_kernels.cpp b/test/stop/combined_kernels.cpp index 7e18a0c32aa..96cf8656c33 100644 --- a/test/stop/combined_kernels.cpp +++ b/test/stop/combined_kernels.cpp @@ -7,7 +7,7 @@ #include #include -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" constexpr gko::size_type test_iterations = 10; diff --git a/test/stop/criterion_kernels.cpp b/test/stop/criterion_kernels.cpp index 6b6094125ba..30280e848d8 100644 --- a/test/stop/criterion_kernels.cpp +++ b/test/stop/criterion_kernels.cpp @@ -7,7 +7,7 @@ #include #include -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" constexpr gko::size_type test_iterations = 10; diff --git a/test/stop/residual_norm_kernels.cpp b/test/stop/residual_norm_kernels.cpp index 7c3ddf6624e..a0a144bcf3b 100644 --- a/test/stop/residual_norm_kernels.cpp +++ b/test/stop/residual_norm_kernels.cpp @@ -8,7 +8,7 @@ #include #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" template diff --git a/test/utils/common_fixture.hpp b/test/utils/common_fixture.hpp new file mode 100644 index 00000000000..7d4883470e7 --- /dev/null +++ b/test/utils/common_fixture.hpp @@ -0,0 +1,82 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_TEST_UTILS_COMMON_FIXTURE_HPP_ +#define GKO_TEST_UTILS_COMMON_FIXTURE_HPP_ + + +#include +#include + + +#include + + +#include +#include + + +#include "core/test/gtest/resources.hpp" +#include "test/utils/executor.hpp" + + +#if GINKGO_COMMON_SINGLE_MODE +#define SKIP_IF_SINGLE_MODE GTEST_SKIP() << "Skip due to single mode" +#else +#define SKIP_IF_SINGLE_MODE \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") +#endif + + +class CommonTestFixture : public ::testing::Test { +public: +#if GINKGO_COMMON_SINGLE_MODE + using value_type = float; +#else + using value_type = 
double; +#endif + using index_type = int; + + CommonTestFixture() + : +#if defined(GKO_TEST_NONDEFAULT_STREAM) && defined(GKO_COMPILING_CUDA) + stream(ResourceEnvironment::cuda_device_id), +#endif +#if defined(GKO_TEST_NONDEFAULT_STREAM) && defined(GKO_COMPILING_HIP) + stream(ResourceEnvironment::hip_device_id), +#endif + ref{gko::ReferenceExecutor::create()} + { +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) + init_executor(ref, exec, stream.get()); +#else + init_executor(ref, exec); +#endif + // set device-id test-wide since some test call device + // kernels directly + guard = exec->get_scoped_device_id_guard(); + } + + void TearDown() final + { + if (exec != nullptr) { + ASSERT_NO_THROW(exec->synchronize()); + } + } + +#ifdef GKO_COMPILING_CUDA + gko::cuda_stream stream; +#endif +#ifdef GKO_COMPILING_HIP + gko::hip_stream stream; +#endif + std::shared_ptr ref; + std::shared_ptr exec; + gko::scoped_device_id_guard guard; +}; + + +#endif // GKO_TEST_UTILS_COMMON_FIXTURE_HPP_ diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index b31d1242f35..9c63d514cb4 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -17,16 +17,6 @@ #include "core/test/gtest/resources.hpp" -#if GINKGO_COMMON_SINGLE_MODE -#define SKIP_IF_SINGLE_MODE GTEST_SKIP() << "Skip due to single mode" -#else -#define SKIP_IF_SINGLE_MODE \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") -#endif - - inline void init_executor(std::shared_ptr, std::shared_ptr& exec) { @@ -83,52 +73,4 @@ inline void init_executor(std::shared_ptr ref, } -class CommonTestFixture : public ::testing::Test { -public: -#if GINKGO_COMMON_SINGLE_MODE - using value_type = float; -#else - using value_type = double; -#endif - using index_type = int; - - CommonTestFixture() - : -#if defined(GKO_TEST_NONDEFAULT_STREAM) && defined(GKO_COMPILING_CUDA) - stream(ResourceEnvironment::cuda_device_id), -#endif -#if 
defined(GKO_TEST_NONDEFAULT_STREAM) && defined(GKO_COMPILING_HIP) - stream(ResourceEnvironment::hip_device_id), -#endif - ref{gko::ReferenceExecutor::create()} - { -#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) - init_executor(ref, exec, stream.get()); -#else - init_executor(ref, exec); -#endif - // set device-id test-wide since some test call device - // kernels directly - guard = exec->get_scoped_device_id_guard(); - } - - void TearDown() final - { - if (exec != nullptr) { - ASSERT_NO_THROW(exec->synchronize()); - } - } - -#ifdef GKO_COMPILING_CUDA - gko::cuda_stream stream; -#endif -#ifdef GKO_COMPILING_HIP - gko::hip_stream stream; -#endif - std::shared_ptr ref; - std::shared_ptr exec; - gko::scoped_device_id_guard guard; -}; - - #endif // GKO_TEST_UTILS_EXECUTOR_HPP_ diff --git a/test/utils/mpi/executor.hpp b/test/utils/mpi/common_fixture.hpp similarity index 90% rename from test/utils/mpi/executor.hpp rename to test/utils/mpi/common_fixture.hpp index 199de02c054..67936bba2b6 100644 --- a/test/utils/mpi/executor.hpp +++ b/test/utils/mpi/common_fixture.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_TEST_UTILS_MPI_EXECUTOR_HPP_ -#define GKO_TEST_UTILS_MPI_EXECUTOR_HPP_ +#ifndef GKO_TEST_UTILS_MPI_COMMON_FIXTURE_HPP_ +#define GKO_TEST_UTILS_MPI_COMMON_FIXTURE_HPP_ #include @@ -64,4 +64,4 @@ class CommonMpiTestFixture : public ::testing::Test { }; -#endif // GKO_TEST_UTILS_MPI_EXECUTOR_HPP_ +#endif // GKO_TEST_UTILS_MPI_COMMON_FIXTURE_HPP_ From 55ad08b3ad99371474ce0c50a7a9e1617cb93808 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 28 Jun 2024 13:01:31 +0200 Subject: [PATCH 031/448] fix format --- test/utils/common_fixture.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/utils/common_fixture.hpp b/test/utils/common_fixture.hpp index 7d4883470e7..55107bc88e9 100644 --- a/test/utils/common_fixture.hpp +++ b/test/utils/common_fixture.hpp @@ -9,14 +9,11 @@ #include #include - #include - #include 
#include - #include "core/test/gtest/resources.hpp" #include "test/utils/executor.hpp" From 2a55ebc065bb9780117d29ae07c00ea662f28d43 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 28 Jun 2024 15:19:28 +0200 Subject: [PATCH 032/448] fix include --- test/distributed/index_map_kernels.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/distributed/index_map_kernels.cpp b/test/distributed/index_map_kernels.cpp index 718fe84ce92..4fb6f111123 100644 --- a/test/distributed/index_map_kernels.cpp +++ b/test/distributed/index_map_kernels.cpp @@ -18,7 +18,7 @@ #include "core/distributed/partition_kernels.hpp" #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" using comm_index_type = gko::experimental::distributed::comm_index_type; From 5d717e26dcaa95a73f4598de6f4f14778c765680 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Wed, 12 Jun 2024 09:49:12 +0200 Subject: [PATCH 033/448] fixup! mean computation --- common/unified/matrix/dense_kernels.template.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index f5b3cc03059..155efc94db0 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -261,7 +261,7 @@ void compute_mean(std::shared_ptr exec, return x(i, j) * inv_total_size; }, GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), - tmp, x, ValueType_nc{1.} / x->get_size()[0]); + tmp, x, ValueType_nc{1.} / std::max(1ul, x->get_size()[0])); } From cff75df8f7821484354fd32e95d14f7a8caa92ef Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Mon, 3 Jun 2024 13:50:02 +0200 Subject: [PATCH 034/448] add results rows checks + early return --- reference/matrix/dense_kernels.cpp | 4 ++++ reference/test/matrix/dense_kernels.cpp | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git 
a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index 53773a131fe..06bea588d50 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -371,11 +371,15 @@ void compute_mean(std::shared_ptr exec, const matrix::Dense* x, matrix::Dense* result, array&) { + GKO_ASSERT_EQ(result->get_size()[0], 1); + using ValueType_nc = gko::remove_complex; for (size_type j = 0; j < x->get_size()[1]; ++j) { result->at(0, j) = zero(); } + if (x->get_size()[0] == 0) return; + for (size_type i = 0; i < x->get_size()[1]; ++i) { for (size_type j = 0; j < x->get_size()[0]; ++j) { result->at(0, i) += x->at(j, i); diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 41294c89d49..e7f95abf4f2 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -691,6 +691,14 @@ TYPED_TEST(Dense, ComputesMean) GKO_EXPECT_NEAR(result->at(0, 2), T{1.0}, r::value * 10); } +TYPED_TEST(Dense, ComputesMeanFailsOnZeroRowResults) +{ + using Mtx = typename TestFixture::Mtx; + using T = typename TestFixture::value_type; + auto result = Mtx::create(this->exec, gko::dim<2>{0, 1}); + + ASSERT_THROW(this->mtx4->compute_mean(result), gko::ValueMismatch); +} TYPED_TEST(Dense, ComputesMeanFailsOnWrongResultSize) { From 2bbc14258133691c3c35ac52e4ca9ace22b281d1 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Thu, 27 Jun 2024 09:12:06 +0200 Subject: [PATCH 035/448] Update common/unified/matrix/dense_kernels.template.cpp Co-authored-by: Tobias Ribizel --- common/unified/matrix/dense_kernels.template.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index 155efc94db0..f469bd997aa 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -261,7 +261,7 @@ void compute_mean(std::shared_ptr exec, 
return x(i, j) * inv_total_size; }, GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), - tmp, x, ValueType_nc{1.} / std::max(1ul, x->get_size()[0])); + tmp, x, ValueType_nc{1.} / std::max(1, x->get_size()[0])); } From 6aa2fe519dde843cbd93c40d9a7f12b81df6af2c Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Thu, 27 Jun 2024 09:58:45 +0200 Subject: [PATCH 036/448] Apply review comments Co-authored-by: Tobias Ribizel Co-authored-by: Marcel Koch --- reference/matrix/dense_kernels.cpp | 2 -- reference/test/matrix/dense_kernels.cpp | 8 -------- 2 files changed, 10 deletions(-) diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index 06bea588d50..40c3c40a3ae 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -371,8 +371,6 @@ void compute_mean(std::shared_ptr exec, const matrix::Dense* x, matrix::Dense* result, array&) { - GKO_ASSERT_EQ(result->get_size()[0], 1); - using ValueType_nc = gko::remove_complex; for (size_type j = 0; j < x->get_size()[1]; ++j) { result->at(0, j) = zero(); diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index e7f95abf4f2..41294c89d49 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -691,14 +691,6 @@ TYPED_TEST(Dense, ComputesMean) GKO_EXPECT_NEAR(result->at(0, 2), T{1.0}, r::value * 10); } -TYPED_TEST(Dense, ComputesMeanFailsOnZeroRowResults) -{ - using Mtx = typename TestFixture::Mtx; - using T = typename TestFixture::value_type; - auto result = Mtx::create(this->exec, gko::dim<2>{0, 1}); - - ASSERT_THROW(this->mtx4->compute_mean(result), gko::ValueMismatch); -} TYPED_TEST(Dense, ComputesMeanFailsOnWrongResultSize) { From c123083b51e271ee3555494c9335d2e27e3183c3 Mon Sep 17 00:00:00 2001 From: Gregor Olenik Date: Tue, 2 Jul 2024 14:18:52 +0200 Subject: [PATCH 037/448] update citation file Co-authored-by: Terry Cojean Co-authored-by: Yu-Hsiang M. 
Tsai --- CITATION.cff | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index d3efc13e771..34accbe7c71 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -26,11 +26,12 @@ authors: - family-names: "Riemer" given-names: "Lukas" - family-names: "Tsai" - given-names: "Yuhsiang" + given-names: "Yu-Hsiang" title: "Ginkgo: A Modern Linear Operator Algebra Framework for High Performance Computing" -version: 1.5.0 -date-released: 2022-11-12 +version: 1.8.0 +date-released: 2024-06-13 url: "https://github.com/ginkgo-project/ginkgo" +license: BSD-3-Clause preferred-citation: type: article authors: From 6a3283519b2c967683eb09a83d39e58cf5685b27 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 29 Jun 2024 14:35:37 +0200 Subject: [PATCH 038/448] run script --- ...hpp.inc => batch_multi_vector_kernels.cpp} | 52 ++ ...hpp.inc => device_matrix_data_kernels.cpp} | 26 + ...ernel_launch.hpp.inc => kernel_launch.hpp} | 51 ++ ...on.hpp.inc => kernel_launch_reduction.hpp} | 23 + ...olver.hpp.inc => kernel_launch_solver.hpp} | 19 + .../components/{atomic.hpp.inc => atomic.hpp} | 23 + ...pp.inc => diagonal_block_manipulation.hpp} | 26 + .../{intrinsics.hpp.inc => intrinsics.hpp} | 20 + .../{merging.hpp.inc => merging.hpp} | 23 + .../{prefix_sum.hpp.inc => prefix_sum.hpp} | 25 + ...kernels.hpp.inc => prefix_sum_kernels.cpp} | 25 + .../{reduction.hpp.inc => reduction.hpp} | 75 +++ .../{searching.hpp.inc => searching.hpp} | 21 + ...{segment_scan.hpp.inc => segment_scan.hpp} | 20 + .../{sorting.hpp.inc => sorting.hpp} | 21 + .../{syncfree.hpp.inc => syncfree.hpp} | 26 + .../{thread_ids.hpp.inc => thread_ids.hpp} | 22 + .../{warp_blas.hpp.inc => warp_blas.hpp} | 26 + ...rix_kernels.hpp.inc => matrix_kernels.cpp} | 32 ++ ....hpp.inc => partition_helpers_kernels.cpp} | 22 + ..._kernels.hpp.inc => partition_kernels.cpp} | 26 + ...tor_kernels.hpp.inc => vector_kernels.cpp} | 26 + ...y_kernels.hpp.inc => cholesky_kernels.cpp} | 106 ++++ 
...nels.hpp.inc => factorization_kernels.cpp} | 36 ++ .../{lu_kernels.hpp.inc => lu_kernels.cpp} | 41 ++ ..._ic_kernels.hpp.inc => par_ic_kernels.cpp} | 32 ++ ...lu_kernels.hpp.inc => par_ilu_kernels.cpp} | 31 ++ ...{batch_logger.hpp.inc => batch_logger.hpp} | 22 + ..._kernels.hpp.inc => batch_csr_kernels.cpp} | 51 ++ ...ernels.hpp.inc => batch_dense_kernels.cpp} | 52 ++ ..._kernels.hpp.inc => batch_ell_kernels.cpp} | 51 ++ .../{coo_kernels.hpp.inc => coo_kernels.cpp} | 42 ++ ...ense_kernels.hpp.inc => dense_kernels.cpp} | 225 ++++++++ ...l_kernels.hpp.inc => diagonal_kernels.cpp} | 32 ++ .../cuda_hip/matrix/ell_kernels.cpp | 148 ++++- common/cuda_hip/matrix/ell_kernels.hpp.inc | 133 ----- ...bcsr_kernels.hpp.inc => fbcsr_kernels.cpp} | 295 ++++++++++ ...ellp_kernels.hpp.inc => sellp_kernels.cpp} | 37 ++ .../cuda_hip/matrix/sparsity_csr_kernels.cpp | 131 ++++- .../matrix/sparsity_csr_kernels.hpp.inc | 111 ---- .../{pgm_kernels.hpp.inc => pgm_kernels.cpp} | 34 ++ ...{isai_kernels.hpp.inc => isai_kernels.cpp} | 42 ++ ...obi_kernels.hpp.inc => jacobi_kernels.cpp} | 44 ++ .../{rcm_kernels.hpp.inc => rcm_kernels.cpp} | 46 ++ ...s_kernels.hpp.inc => cb_gmres_kernels.cpp} | 499 +++++++++++++++++ .../cuda_hip/solver/idr_kernels.cpp | 329 +++++++++++- common/cuda_hip/solver/idr_kernels.hpp.inc | 318 ----------- ..._kernels.hpp.inc => multigrid_kernels.cpp} | 34 ++ ...ch_criteria.hpp.inc => batch_criteria.hpp} | 21 + cuda/base/batch_multi_vector_kernels.cu | 56 -- cuda/base/device_matrix_data_kernels.cu | 31 -- cuda/base/kernel_launch.cuh | 56 -- cuda/base/kernel_launch_reduction.cuh | 28 - cuda/base/kernel_launch_solver.cuh | 24 - cuda/components/atomic.cuh | 28 - .../diagonal_block_manipulation.cuh | 31 -- cuda/components/intrinsics.cuh | 25 - cuda/components/merging.cuh | 28 - cuda/components/prefix_sum.cuh | 30 -- cuda/components/prefix_sum_kernels.cu | 30 -- cuda/components/reduction.cuh | 80 --- cuda/components/searching.cuh | 26 - cuda/components/segment_scan.cuh | 25 
- cuda/components/sorting.cuh | 26 - cuda/components/syncfree.cuh | 31 -- cuda/components/thread_ids.cuh | 27 - cuda/components/warp_blas.cuh | 31 -- cuda/distributed/matrix_kernels.cu | 37 -- cuda/distributed/partition_helpers_kernels.cu | 27 - cuda/distributed/partition_kernels.cu | 31 -- cuda/distributed/vector_kernels.cu | 31 -- cuda/factorization/cholesky_kernels.cu | 111 ---- cuda/factorization/factorization_kernels.cu | 41 -- cuda/factorization/lu_kernels.cu | 46 -- cuda/factorization/par_ic_kernels.cu | 37 -- cuda/factorization/par_ilu_kernels.cu | 36 -- cuda/log/batch_logger.cuh | 27 - cuda/matrix/batch_csr_kernels.cu | 55 -- cuda/matrix/batch_dense_kernels.cu | 56 -- cuda/matrix/batch_ell_kernels.cu | 55 -- cuda/matrix/coo_kernels.cu | 47 -- cuda/matrix/dense_kernels.cu | 230 -------- cuda/matrix/diagonal_kernels.cu | 37 -- cuda/matrix/fbcsr_kernels.template.cu | 299 ----------- cuda/matrix/sellp_kernels.cu | 42 -- cuda/matrix/sparsity_csr_kernels.cu | 223 -------- cuda/multigrid/pgm_kernels.cu | 39 -- cuda/preconditioner/isai_kernels.cu | 47 -- cuda/preconditioner/jacobi_kernels.cu | 49 -- cuda/reorder/rcm_kernels.cu | 51 -- cuda/solver/cb_gmres_kernels.cu | 504 ------------------ cuda/solver/multigrid_kernels.cu | 39 -- cuda/stop/batch_criteria.cuh | 26 - hip/base/batch_multi_vector_kernels.hip.cpp | 56 -- hip/base/device_matrix_data_kernels.hip.cpp | 31 -- hip/base/kernel_launch.hip.hpp | 56 -- hip/base/kernel_launch_reduction.hip.hpp | 28 - hip/base/kernel_launch_solver.hip.hpp | 24 - hip/components/atomic.hip.hpp | 28 - .../diagonal_block_manipulation.hip.hpp | 31 -- hip/components/intrinsics.hip.hpp | 25 - hip/components/merging.hip.hpp | 28 - hip/components/prefix_sum.hip.hpp | 30 -- hip/components/prefix_sum_kernels.hip.cpp | 30 -- hip/components/reduction.hip.hpp | 80 --- hip/components/searching.hip.hpp | 26 - hip/components/segment_scan.hip.hpp | 25 - hip/components/sorting.hip.hpp | 26 - hip/components/syncfree.hip.hpp | 31 -- 
hip/components/thread_ids.hip.hpp | 27 - hip/components/warp_blas.hip.hpp | 31 -- hip/distributed/matrix_kernels.hip.cpp | 37 -- .../partition_helpers_kernels.hip.cpp | 27 - hip/distributed/partition_kernels.hip.cpp | 31 -- hip/distributed/vector_kernels.hip.cpp | 31 -- hip/factorization/cholesky_kernels.hip.cpp | 111 ---- .../factorization_kernels.hip.cpp | 41 -- hip/factorization/lu_kernels.hip.cpp | 46 -- hip/factorization/par_ic_kernels.hip.cpp | 37 -- hip/factorization/par_ilu_kernels.hip.cpp | 36 -- hip/log/batch_logger.hip.hpp | 26 - hip/matrix/batch_csr_kernels.hip.cpp | 55 -- hip/matrix/batch_dense_kernels.hip.cpp | 56 -- hip/matrix/batch_ell_kernels.hip.cpp | 55 -- hip/matrix/coo_kernels.hip.cpp | 47 -- hip/matrix/dense_kernels.hip.cpp | 230 -------- hip/matrix/diagonal_kernels.hip.cpp | 37 -- hip/matrix/ell_kernels.hip.cpp | 270 ---------- hip/matrix/fbcsr_kernels.template.hip.cpp | 299 ----------- hip/matrix/sellp_kernels.hip.cpp | 42 -- hip/multigrid/pgm_kernels.hip.cpp | 39 -- hip/preconditioner/isai_kernels.hip.cpp | 47 -- hip/preconditioner/jacobi_kernels.hip.cpp | 49 -- hip/reorder/rcm_kernels.hip.cpp | 51 -- hip/solver/cb_gmres_kernels.hip.cpp | 504 ------------------ hip/solver/idr_kernels.hip.cpp | 340 ------------ hip/solver/multigrid_kernels.hip.cpp | 39 -- hip/stop/batch_criteria.hip.hpp | 26 - 138 files changed, 2981 insertions(+), 6650 deletions(-) rename common/cuda_hip/base/{batch_multi_vector_kernels.hpp.inc => batch_multi_vector_kernels.cpp} (89%) rename common/cuda_hip/base/{device_matrix_data_kernels.hpp.inc => device_matrix_data_kernels.cpp} (88%) rename common/cuda_hip/base/{kernel_launch.hpp.inc => kernel_launch.hpp} (58%) rename common/cuda_hip/base/{kernel_launch_reduction.hpp.inc => kernel_launch_reduction.hpp} (97%) rename common/cuda_hip/base/{kernel_launch_solver.hpp.inc => kernel_launch_solver.hpp} (77%) rename common/cuda_hip/components/{atomic.hpp.inc => atomic.hpp} (95%) rename 
common/cuda_hip/components/{diagonal_block_manipulation.hpp.inc => diagonal_block_manipulation.hpp} (81%) rename common/cuda_hip/components/{intrinsics.hpp.inc => intrinsics.hpp} (74%) rename common/cuda_hip/components/{merging.hpp.inc => merging.hpp} (95%) rename common/cuda_hip/components/{prefix_sum.hpp.inc => prefix_sum.hpp} (91%) rename common/cuda_hip/components/{prefix_sum_kernels.hpp.inc => prefix_sum_kernels.cpp} (80%) rename common/cuda_hip/components/{reduction.hpp.inc => reduction.hpp} (78%) rename common/cuda_hip/components/{searching.hpp.inc => searching.hpp} (95%) rename common/cuda_hip/components/{segment_scan.hpp.inc => segment_scan.hpp} (73%) rename common/cuda_hip/components/{sorting.hpp.inc => sorting.hpp} (96%) rename common/cuda_hip/components/{syncfree.hpp.inc => syncfree.hpp} (86%) rename common/cuda_hip/components/{thread_ids.hpp.inc => thread_ids.hpp} (94%) rename common/cuda_hip/components/{warp_blas.hpp.inc => warp_blas.hpp} (97%) rename common/cuda_hip/distributed/{matrix_kernels.hpp.inc => matrix_kernels.cpp} (91%) rename common/cuda_hip/distributed/{partition_helpers_kernels.hpp.inc => partition_helpers_kernels.cpp} (70%) rename common/cuda_hip/distributed/{partition_kernels.hpp.inc => partition_kernels.cpp} (89%) rename common/cuda_hip/distributed/{vector_kernels.hpp.inc => vector_kernels.cpp} (84%) rename common/cuda_hip/factorization/{cholesky_kernels.hpp.inc => cholesky_kernels.cpp} (78%) rename common/cuda_hip/factorization/{factorization_kernels.hpp.inc => factorization_kernels.cpp} (95%) rename common/cuda_hip/factorization/{lu_kernels.hpp.inc => lu_kernels.cpp} (92%) rename common/cuda_hip/factorization/{par_ic_kernels.hpp.inc => par_ic_kernels.cpp} (84%) rename common/cuda_hip/factorization/{par_ilu_kernels.hpp.inc => par_ilu_kernels.cpp} (84%) rename common/cuda_hip/log/{batch_logger.hpp.inc => batch_logger.hpp} (67%) rename common/cuda_hip/matrix/{batch_csr_kernels.hpp.inc => batch_csr_kernels.cpp} (87%) rename 
common/cuda_hip/matrix/{batch_dense_kernels.hpp.inc => batch_dense_kernels.cpp} (89%) rename common/cuda_hip/matrix/{batch_ell_kernels.hpp.inc => batch_ell_kernels.cpp} (87%) rename common/cuda_hip/matrix/{coo_kernels.hpp.inc => coo_kernels.cpp} (91%) rename common/cuda_hip/matrix/{dense_kernels.hpp.inc => dense_kernels.cpp} (75%) rename common/cuda_hip/matrix/{diagonal_kernels.hpp.inc => diagonal_kernels.cpp} (73%) rename cuda/matrix/ell_kernels.cu => common/cuda_hip/matrix/ell_kernels.cpp (61%) delete mode 100644 common/cuda_hip/matrix/ell_kernels.hpp.inc rename common/cuda_hip/matrix/{fbcsr_kernels.hpp.inc => fbcsr_kernels.cpp} (57%) rename common/cuda_hip/matrix/{sellp_kernels.hpp.inc => sellp_kernels.cpp} (83%) rename hip/matrix/sparsity_csr_kernels.hip.cpp => common/cuda_hip/matrix/sparsity_csr_kernels.cpp (61%) delete mode 100644 common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc rename common/cuda_hip/multigrid/{pgm_kernels.hpp.inc => pgm_kernels.cpp} (77%) rename common/cuda_hip/preconditioner/{isai_kernels.hpp.inc => isai_kernels.cpp} (95%) rename common/cuda_hip/preconditioner/{jacobi_kernels.hpp.inc => jacobi_kernels.cpp} (91%) rename common/cuda_hip/reorder/{rcm_kernels.hpp.inc => rcm_kernels.cpp} (95%) rename common/cuda_hip/solver/{cb_gmres_kernels.hpp.inc => cb_gmres_kernels.cpp} (50%) rename cuda/solver/idr_kernels.cu => common/cuda_hip/solver/idr_kernels.cpp (52%) delete mode 100644 common/cuda_hip/solver/idr_kernels.hpp.inc rename common/cuda_hip/solver/{multigrid_kernels.hpp.inc => multigrid_kernels.cpp} (89%) rename common/cuda_hip/stop/{batch_criteria.hpp.inc => batch_criteria.hpp} (75%) delete mode 100644 cuda/base/batch_multi_vector_kernels.cu delete mode 100644 cuda/base/device_matrix_data_kernels.cu delete mode 100644 cuda/base/kernel_launch.cuh delete mode 100644 cuda/base/kernel_launch_reduction.cuh delete mode 100644 cuda/base/kernel_launch_solver.cuh delete mode 100644 cuda/components/atomic.cuh delete mode 100644 
cuda/components/diagonal_block_manipulation.cuh delete mode 100644 cuda/components/intrinsics.cuh delete mode 100644 cuda/components/merging.cuh delete mode 100644 cuda/components/prefix_sum.cuh delete mode 100644 cuda/components/prefix_sum_kernels.cu delete mode 100644 cuda/components/reduction.cuh delete mode 100644 cuda/components/searching.cuh delete mode 100644 cuda/components/segment_scan.cuh delete mode 100644 cuda/components/sorting.cuh delete mode 100644 cuda/components/syncfree.cuh delete mode 100644 cuda/components/thread_ids.cuh delete mode 100644 cuda/components/warp_blas.cuh delete mode 100644 cuda/distributed/matrix_kernels.cu delete mode 100644 cuda/distributed/partition_helpers_kernels.cu delete mode 100644 cuda/distributed/partition_kernels.cu delete mode 100644 cuda/distributed/vector_kernels.cu delete mode 100644 cuda/factorization/cholesky_kernels.cu delete mode 100644 cuda/factorization/factorization_kernels.cu delete mode 100644 cuda/factorization/lu_kernels.cu delete mode 100644 cuda/factorization/par_ic_kernels.cu delete mode 100644 cuda/factorization/par_ilu_kernels.cu delete mode 100644 cuda/log/batch_logger.cuh delete mode 100644 cuda/matrix/batch_csr_kernels.cu delete mode 100644 cuda/matrix/batch_dense_kernels.cu delete mode 100644 cuda/matrix/batch_ell_kernels.cu delete mode 100644 cuda/matrix/coo_kernels.cu delete mode 100644 cuda/matrix/dense_kernels.cu delete mode 100644 cuda/matrix/diagonal_kernels.cu delete mode 100644 cuda/matrix/fbcsr_kernels.template.cu delete mode 100644 cuda/matrix/sellp_kernels.cu delete mode 100644 cuda/matrix/sparsity_csr_kernels.cu delete mode 100644 cuda/multigrid/pgm_kernels.cu delete mode 100644 cuda/preconditioner/isai_kernels.cu delete mode 100644 cuda/preconditioner/jacobi_kernels.cu delete mode 100644 cuda/reorder/rcm_kernels.cu delete mode 100644 cuda/solver/cb_gmres_kernels.cu delete mode 100644 cuda/solver/multigrid_kernels.cu delete mode 100644 cuda/stop/batch_criteria.cuh delete mode 100644 
hip/base/batch_multi_vector_kernels.hip.cpp delete mode 100644 hip/base/device_matrix_data_kernels.hip.cpp delete mode 100644 hip/base/kernel_launch.hip.hpp delete mode 100644 hip/base/kernel_launch_reduction.hip.hpp delete mode 100644 hip/base/kernel_launch_solver.hip.hpp delete mode 100644 hip/components/atomic.hip.hpp delete mode 100644 hip/components/diagonal_block_manipulation.hip.hpp delete mode 100644 hip/components/intrinsics.hip.hpp delete mode 100644 hip/components/merging.hip.hpp delete mode 100644 hip/components/prefix_sum.hip.hpp delete mode 100644 hip/components/prefix_sum_kernels.hip.cpp delete mode 100644 hip/components/reduction.hip.hpp delete mode 100644 hip/components/searching.hip.hpp delete mode 100644 hip/components/segment_scan.hip.hpp delete mode 100644 hip/components/sorting.hip.hpp delete mode 100644 hip/components/syncfree.hip.hpp delete mode 100644 hip/components/thread_ids.hip.hpp delete mode 100644 hip/components/warp_blas.hip.hpp delete mode 100644 hip/distributed/matrix_kernels.hip.cpp delete mode 100644 hip/distributed/partition_helpers_kernels.hip.cpp delete mode 100644 hip/distributed/partition_kernels.hip.cpp delete mode 100644 hip/distributed/vector_kernels.hip.cpp delete mode 100644 hip/factorization/cholesky_kernels.hip.cpp delete mode 100644 hip/factorization/factorization_kernels.hip.cpp delete mode 100644 hip/factorization/lu_kernels.hip.cpp delete mode 100644 hip/factorization/par_ic_kernels.hip.cpp delete mode 100644 hip/factorization/par_ilu_kernels.hip.cpp delete mode 100644 hip/log/batch_logger.hip.hpp delete mode 100644 hip/matrix/batch_csr_kernels.hip.cpp delete mode 100644 hip/matrix/batch_dense_kernels.hip.cpp delete mode 100644 hip/matrix/batch_ell_kernels.hip.cpp delete mode 100644 hip/matrix/coo_kernels.hip.cpp delete mode 100644 hip/matrix/dense_kernels.hip.cpp delete mode 100644 hip/matrix/diagonal_kernels.hip.cpp delete mode 100644 hip/matrix/ell_kernels.hip.cpp delete mode 100644 
hip/matrix/fbcsr_kernels.template.hip.cpp delete mode 100644 hip/matrix/sellp_kernels.hip.cpp delete mode 100644 hip/multigrid/pgm_kernels.hip.cpp delete mode 100644 hip/preconditioner/isai_kernels.hip.cpp delete mode 100644 hip/preconditioner/jacobi_kernels.hip.cpp delete mode 100644 hip/reorder/rcm_kernels.hip.cpp delete mode 100644 hip/solver/cb_gmres_kernels.hip.cpp delete mode 100644 hip/solver/idr_kernels.hip.cpp delete mode 100644 hip/solver/multigrid_kernels.hip.cpp delete mode 100644 hip/stop/batch_criteria.hip.hpp diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.cpp similarity index 89% rename from common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc rename to common/cuda_hip/base/batch_multi_vector_kernels.cpp index 9b6301674be..0261dbb97ce 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp @@ -2,6 +2,47 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/base/batch_multi_vector_kernels.hpp" + +#include +#include + +#include +#include + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The MultiVector matrix format namespace. 
+ * + * @ingroup batch_multi_vector + */ +namespace batch_multi_vector { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + + template __device__ __forceinline__ void scale( const gko::batch::multi_vector::batch_item& alpha, @@ -299,3 +340,14 @@ __launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel( copy(src_b, dst_b); } } + + +#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_multi_vector +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc b/common/cuda_hip/base/device_matrix_data_kernels.cpp similarity index 88% rename from common/cuda_hip/base/device_matrix_data_kernels.hpp.inc rename to common/cuda_hip/base/device_matrix_data_kernels.cpp index 70cbd9e7391..61a7a6281a9 100644 --- a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc +++ b/common/cuda_hip/base/device_matrix_data_kernels.cpp @@ -2,6 +2,26 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/base/device_matrix_data_kernels.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace components { + + template void remove_zeros(std::shared_ptr exec, array& values, array& row_idxs, @@ -99,3 +119,9 @@ void sort_row_major(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL); + + +} // namespace components +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/base/kernel_launch.hpp.inc 
b/common/cuda_hip/base/kernel_launch.hpp similarity index 58% rename from common/cuda_hip/base/kernel_launch.hpp.inc rename to common/cuda_hip/base/kernel_launch.hpp index c46e6c879cb..dd20eb5769f 100644 --- a/common/cuda_hip/base/kernel_launch.hpp.inc +++ b/common/cuda_hip/base/kernel_launch.hpp @@ -2,6 +2,52 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch.hpp" +#endif + + +#include + +#include "accessor/cuda_hip_helper.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + +template +struct to_device_type_impl&> { + using type = std::decay_t>()))>; + static type map_to_device(gko::acc::range& range) + { + return gko::acc::as_device_range(range); + } +}; + +template +struct to_device_type_impl&> { + using type = std::decay_t>()))>; + static type map_to_device(const gko::acc::range& range) + { + return gko::acc::as_device_range(range); + } +}; + + +namespace device_std = thrust; + + +constexpr int default_block_size = 512; + + template __global__ __launch_bounds__(default_block_size) void generic_kernel_1d( int64 size, KernelFunction fn, KernelArgs... 
args) @@ -52,3 +98,8 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, map_to_device(args)...); } } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc b/common/cuda_hip/base/kernel_launch_reduction.hpp similarity index 97% rename from common/cuda_hip/base/kernel_launch_reduction.hpp.inc rename to common/cuda_hip/base/kernel_launch_reduction.hpp index e5caedacb1f..86e082ac2c1 100644 --- a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc +++ b/common/cuda_hip/base/kernel_launch_reduction.hpp @@ -2,6 +2,24 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" +#endif + + +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + template __global__ __launch_bounds__( @@ -505,3 +523,8 @@ void run_kernel_col_reduction_cached( } } } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/base/kernel_launch_solver.hpp.inc b/common/cuda_hip/base/kernel_launch_solver.hpp similarity index 77% rename from common/cuda_hip/base/kernel_launch_solver.hpp.inc rename to common/cuda_hip/base/kernel_launch_solver.hpp index cef3c8a3adc..742da85fd96 100644 --- a/common/cuda_hip/base/kernel_launch_solver.hpp.inc +++ b/common/cuda_hip/base/kernel_launch_solver.hpp @@ -2,6 +2,20 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_SOLVER_HPP_ +#error \ + "This file can only be used from 
inside common/unified/base/kernel_launch_solver.hpp" +#endif + + +#include "common/cuda_hip/base/runtime.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + template __global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver( int64 rows, int64 cols, int64 default_stride, KernelFunction fn, @@ -32,3 +46,8 @@ void run_kernel_solver(std::shared_ptr exec, static_cast(default_stride), fn, map_to_device(args)...); } } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp similarity index 95% rename from common/cuda_hip/components/atomic.hpp.inc rename to common/cuda_hip/components/atomic.hpp index 60eaf5a9dd9..e0384222734 100644 --- a/common/cuda_hip/components/atomic.hpp.inc +++ b/common/cuda_hip/components/atomic.hpp @@ -2,6 +2,21 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_ + + +#include + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + namespace detail { @@ -228,3 +243,11 @@ __forceinline__ __device__ thrust::complex atomic_add( auto imag = atomic_add(addr + 1, val.imag()); return {real, imag}; } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/components/diagonal_block_manipulation.hpp.inc b/common/cuda_hip/components/diagonal_block_manipulation.hpp similarity index 81% rename from common/cuda_hip/components/diagonal_block_manipulation.hpp.inc rename to common/cuda_hip/components/diagonal_block_manipulation.hpp index a8e7004b5aa..5c0be150d21 100644 --- 
a/common/cuda_hip/components/diagonal_block_manipulation.hpp.inc +++ b/common/cuda_hip/components/diagonal_block_manipulation.hpp @@ -2,6 +2,23 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_ + + +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace csr { + + /** * @internal * @@ -63,3 +80,12 @@ __device__ __forceinline__ void extract_transposed_diag_blocks( } } } + + +} // namespace csr +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/components/intrinsics.hpp.inc b/common/cuda_hip/components/intrinsics.hpp similarity index 74% rename from common/cuda_hip/components/intrinsics.hpp.inc rename to common/cuda_hip/components/intrinsics.hpp index 3fc28cee871..398e4325cc2 100644 --- a/common/cuda_hip/components/intrinsics.hpp.inc +++ b/common/cuda_hip/components/intrinsics.hpp @@ -2,6 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_ + + +#include + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + /** * @internal * Returns the number of set bits in the given mask. 
@@ -36,3 +48,11 @@ __forceinline__ __device__ int clz(uint32 mask) { return __clz(mask); } /** @copydoc clz */ __forceinline__ __device__ int clz(uint64 mask) { return __clzll(mask); } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/components/merging.hpp.inc b/common/cuda_hip/components/merging.hpp similarity index 95% rename from common/cuda_hip/components/merging.hpp.inc rename to common/cuda_hip/components/merging.hpp index d77707795a1..b1bca2a0c78 100644 --- a/common/cuda_hip/components/merging.hpp.inc +++ b/common/cuda_hip/components/merging.hpp @@ -2,6 +2,21 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_ + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "core/base/utils.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + namespace detail { @@ -280,3 +295,11 @@ __forceinline__ __device__ void sequential_match(const ValueType* a, return a_idx < a_size && b_idx < b_size; }); } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/components/prefix_sum.hpp.inc b/common/cuda_hip/components/prefix_sum.hpp similarity index 91% rename from common/cuda_hip/components/prefix_sum.hpp.inc rename to common/cuda_hip/components/prefix_sum.hpp index 474b0b88cd1..8fc5bbe63b0 100644 --- a/common/cuda_hip/components/prefix_sum.hpp.inc +++ b/common/cuda_hip/components/prefix_sum.hpp @@ -2,6 +2,23 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_ +#define 
GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_ + + +#include + +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + /** * @internal * Computes the prefix sum and total sum of `element` over a subwarp. @@ -158,3 +175,11 @@ __global__ __launch_bounds__(block_size) void finalize_prefix_sum( elements[tidx] += prefix_block_sum; } } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/components/prefix_sum_kernels.hpp.inc b/common/cuda_hip/components/prefix_sum_kernels.cpp similarity index 80% rename from common/cuda_hip/components/prefix_sum_kernels.hpp.inc rename to common/cuda_hip/components/prefix_sum_kernels.cpp index c232e115a22..40cb1bc48fc 100644 --- a/common/cuda_hip/components/prefix_sum_kernels.hpp.inc +++ b/common/cuda_hip/components/prefix_sum_kernels.cpp @@ -2,6 +2,25 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/components/prefix_sum_kernels.hpp" + +#include + +#include + +#include +#include +#include + +#include "common/cuda_hip/base/thrust.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace components { + + template struct overflowing_sum { constexpr static IndexType max = std::numeric_limits::max(); @@ -56,3 +75,9 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_NONNEGATIVE_KERNEL); // instantiate for size_type as well, as this is used in the Sellp format template void prefix_sum_nonnegative( std::shared_ptr, size_type*, size_type); + + +} // namespace components +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git 
a/common/cuda_hip/components/reduction.hpp.inc b/common/cuda_hip/components/reduction.hpp similarity index 78% rename from common/cuda_hip/components/reduction.hpp.inc rename to common/cuda_hip/components/reduction.hpp index 1a6a64d6fb7..d2889bb9c7e 100644 --- a/common/cuda_hip/components/reduction.hpp.inc +++ b/common/cuda_hip/components/reduction.hpp @@ -2,6 +2,32 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_ + + +#include + +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/array_access.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + +constexpr int default_reduce_block_size = 512; + + /** * @internal * @@ -222,3 +248,52 @@ __launch_bounds__(default_reduce_block_size) void reduce_add_array_with_initial_ result[blockIdx.x] += block_sum[0]; } } + + +/** + * Compute a reduction using add operation (+). + * + * @param exec Executor associated to the array + * @param size size of the array + * @param source the pointer of the array + * + * @return the reduction result + */ +template +__host__ ValueType reduce_add_array(std::shared_ptr exec, + size_type size, const ValueType* source) +{ + auto block_results_val = source; + size_type grid_dim = size; + auto block_results = array(exec); + if (size > default_reduce_block_size) { + const auto n = ceildiv(size, default_reduce_block_size); + grid_dim = + (n <= default_reduce_block_size) ? 
n : default_reduce_block_size; + + block_results.resize_and_reset(grid_dim); + + reduce_add_array<<get_stream()>>>( + size, as_device_type(source), + as_device_type(block_results.get_data())); + + block_results_val = block_results.get_const_data(); + } + + auto d_result = array(exec, 1); + + reduce_add_array<<<1, default_reduce_block_size, 0, exec->get_stream()>>>( + grid_dim, as_device_type(block_results_val), + as_device_type(d_result.get_data())); + auto answer = get_element(d_result, 0); + return answer; +} + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/components/searching.hpp.inc b/common/cuda_hip/components/searching.hpp similarity index 95% rename from common/cuda_hip/components/searching.hpp.inc rename to common/cuda_hip/components/searching.hpp index a0f842dca35..599e7a8581c 100644 --- a/common/cuda_hip/components/searching.hpp.inc +++ b/common/cuda_hip/components/searching.hpp @@ -2,6 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_ + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + /** * @internal * Generic binary search that finds the first index where a predicate is true. @@ -208,3 +221,11 @@ __forceinline__ __device__ IndexType group_ary_search(IndexType offset, auto pos = mask == 0 ? 
group.size() : ffs(mask) - 1; return offset + pos; } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/components/segment_scan.hpp.inc b/common/cuda_hip/components/segment_scan.hpp similarity index 73% rename from common/cuda_hip/components/segment_scan.hpp.inc rename to common/cuda_hip/components/segment_scan.hpp index 75cc0654531..d2f992850ef 100644 --- a/common/cuda_hip/components/segment_scan.hpp.inc +++ b/common/cuda_hip/components/segment_scan.hpp @@ -2,6 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_ + + +#include "common/cuda_hip/components/cooperative_groups.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + /** * @internal * @@ -33,3 +45,11 @@ __device__ __forceinline__ bool segment_scan( } return head; } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/components/sorting.hpp.inc b/common/cuda_hip/components/sorting.hpp similarity index 96% rename from common/cuda_hip/components/sorting.hpp.inc rename to common/cuda_hip/components/sorting.hpp index 10db7eb6daa..ecc9c5289f9 100644 --- a/common/cuda_hip/components/sorting.hpp.inc +++ b/common/cuda_hip/components/sorting.hpp @@ -2,6 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_ + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + namespace detail { @@ -291,3 +304,11 @@ 
__forceinline__ __device__ void bitonic_sort(ValueType* local_elements, local_elements, false); } } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/components/syncfree.hpp.inc b/common/cuda_hip/components/syncfree.hpp similarity index 86% rename from common/cuda_hip/components/syncfree.hpp.inc rename to common/cuda_hip/components/syncfree.hpp index f0d0bbe4d22..3c82c916a21 100644 --- a/common/cuda_hip/components/syncfree.hpp.inc +++ b/common/cuda_hip/components/syncfree.hpp @@ -2,6 +2,24 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_ + + +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "core/components/fill_array_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + struct syncfree_storage { using status_word = int; @@ -110,3 +128,11 @@ class syncfree_scheduler { IndexType work_id; IndexType block_id; }; + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/components/thread_ids.hpp.inc b/common/cuda_hip/components/thread_ids.hpp similarity index 94% rename from common/cuda_hip/components/thread_ids.hpp.inc rename to common/cuda_hip/components/thread_ids.hpp index 1befa428f3c..4fef650f51c 100644 --- a/common/cuda_hip/components/thread_ids.hpp.inc +++ b/common/cuda_hip/components/thread_ids.hpp @@ -2,6 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_ +#define 
GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_ + + +#include "common/cuda_hip/base/config.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace thread { + + /** * @internal * @@ -242,3 +255,12 @@ __device__ __forceinline__ IndexType get_subwarp_num_flat() "subwarp_size must be a power of two"); return blockDim.x / subwarp_size * static_cast(gridDim.x); } + + +} // namespace thread +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/components/warp_blas.hpp.inc b/common/cuda_hip/components/warp_blas.hpp similarity index 97% rename from common/cuda_hip/components/warp_blas.hpp.inc rename to common/cuda_hip/components/warp_blas.hpp index 61b2ae25e7f..1f25bb61634 100644 --- a/common/cuda_hip/components/warp_blas.hpp.inc +++ b/common/cuda_hip/components/warp_blas.hpp @@ -2,6 +2,24 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_ + + +#include +#include + +#include + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/reduction.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + /** * @internal * @@ -409,3 +427,11 @@ __device__ __forceinline__ remove_complex compute_infinity_norm( return reduce(group, sum, [](result_type x, result_type y) { return max(x, y); }); } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/distributed/matrix_kernels.hpp.inc b/common/cuda_hip/distributed/matrix_kernels.cpp similarity index 91% rename from common/cuda_hip/distributed/matrix_kernels.hpp.inc rename to common/cuda_hip/distributed/matrix_kernels.cpp index 
8848e490c18..6b5f997d153 100644 --- a/common/cuda_hip/distributed/matrix_kernels.hpp.inc +++ b/common/cuda_hip/distributed/matrix_kernels.cpp @@ -2,6 +2,32 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/distributed/matrix_kernels.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/atomic.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace distributed_matrix { + + template struct input_type { GlobalIndexType row; @@ -170,3 +196,9 @@ void separate_local_nonlocal( GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); + + +} // namespace distributed_matrix +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc b/common/cuda_hip/distributed/partition_helpers_kernels.cpp similarity index 70% rename from common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc rename to common/cuda_hip/distributed/partition_helpers_kernels.cpp index 88343370d99..cd1419230d2 100644 --- a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc +++ b/common/cuda_hip/distributed/partition_helpers_kernels.cpp @@ -2,6 +2,22 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/distributed/partition_helpers_kernels.hpp" + +#include +#include +#include +#include + +#include "common/cuda_hip/base/thrust.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace partition_helpers { + + template void sort_by_range_start( std::shared_ptr exec, @@ -24,3 +40,9 @@ void sort_by_range_start( GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); + + +} // namespace partition_helpers +} // namespace GKO_DEVICE_NAMESPACE +} // namespace 
kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/distributed/partition_kernels.hpp.inc b/common/cuda_hip/distributed/partition_kernels.cpp similarity index 89% rename from common/cuda_hip/distributed/partition_kernels.hpp.inc rename to common/cuda_hip/distributed/partition_kernels.cpp index 20f3ebd47dc..b4e051b97f5 100644 --- a/common/cuda_hip/distributed/partition_kernels.hpp.inc +++ b/common/cuda_hip/distributed/partition_kernels.cpp @@ -2,6 +2,26 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/distributed/partition_kernels.hpp" + +#include +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/unified/base/kernel_launch.hpp" +#include "core/components/fill_array_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace partition { + + namespace kernel { @@ -110,3 +130,9 @@ void build_starting_indices(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES); + + +} // namespace partition +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/distributed/vector_kernels.hpp.inc b/common/cuda_hip/distributed/vector_kernels.cpp similarity index 84% rename from common/cuda_hip/distributed/vector_kernels.hpp.inc rename to common/cuda_hip/distributed/vector_kernels.cpp index 6a0497db78a..91bd838497d 100644 --- a/common/cuda_hip/distributed/vector_kernels.hpp.inc +++ b/common/cuda_hip/distributed/vector_kernels.cpp @@ -2,6 +2,26 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/distributed/vector_kernels.hpp" + +#include +#include +#include +#include +#include +#include + +#include + +#include "common/cuda_hip/base/thrust.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace distributed_vector { + + template void build_local( 
std::shared_ptr exec, @@ -65,3 +85,9 @@ void build_local( GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); + + +} // namespace distributed_vector +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc b/common/cuda_hip/factorization/cholesky_kernels.cpp similarity index 78% rename from common/cuda_hip/factorization/cholesky_kernels.hpp.inc rename to common/cuda_hip/factorization/cholesky_kernels.cpp index e6220019d22..6e6be7b81fd 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -2,6 +2,49 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/factorization/cholesky_kernels.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/syncfree.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/factorization/elimination_forest.hpp" +#include "core/factorization/lu_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Cholesky namespace. 
+ * + * @ingroup factor + */ +namespace cholesky { + + +constexpr int default_block_size = 512; + + #include "core/factorization/elimination_forest.hpp" namespace kernel { @@ -330,3 +373,66 @@ void factorize(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); + + +template +void symbolic_count(std::shared_ptr exec, + const matrix::Csr* mtx, + const factorization::elimination_forest& forest, + IndexType* row_nnz, array& tmp_storage) +{ + const auto num_rows = static_cast(mtx->get_size()[0]); + if (num_rows == 0) { + return; + } + const auto mtx_nnz = static_cast(mtx->get_num_stored_elements()); + tmp_storage.resize_and_reset(mtx_nnz + num_rows); + const auto postorder_cols = tmp_storage.get_data(); + const auto lower_ends = postorder_cols + mtx_nnz; + const auto row_ptrs = mtx->get_const_row_ptrs(); + const auto cols = mtx->get_const_col_idxs(); + const auto inv_postorder = forest.inv_postorder.get_const_data(); + const auto postorder_parent = forest.postorder_parents.get_const_data(); + // transform col indices to postorder indices + { + const auto num_blocks = ceildiv(num_rows, default_block_size); + kernel::build_postorder_cols<<get_stream()>>>( + num_rows, cols, row_ptrs, inv_postorder, postorder_cols, + lower_ends); + } + // sort postorder_cols inside rows + { + const auto handle = exec->get_sparselib_handle(); + auto descr = sparselib::create_mat_descr(); + array permutation_array(exec, mtx_nnz); + auto permutation = permutation_array.get_data(); + components::fill_seq_array(exec, permutation, mtx_nnz); + size_type buffer_size{}; + sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, + row_ptrs, postorder_cols, buffer_size); + array buffer_array{exec, buffer_size}; + auto buffer = buffer_array.get_data(); + sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, + postorder_cols, permutation, buffer); + sparselib::destroy(descr); + } + // count nonzeros per row of L + { + const 
auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + kernel::symbolic_count + <<get_stream()>>>( + num_rows, row_ptrs, lower_ends, inv_postorder, postorder_cols, + postorder_parent, row_nnz); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); + + +} // namespace cholesky +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/factorization/factorization_kernels.hpp.inc b/common/cuda_hip/factorization/factorization_kernels.cpp similarity index 95% rename from common/cuda_hip/factorization/factorization_kernels.hpp.inc rename to common/cuda_hip/factorization/factorization_kernels.cpp index 806797e60d8..da2666feb25 100644 --- a/common/cuda_hip/factorization/factorization_kernels.hpp.inc +++ b/common/cuda_hip/factorization/factorization_kernels.cpp @@ -2,6 +2,36 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/factorization/factorization_kernels.hpp" + +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/base/array_access.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/csr_builder.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The factorization namespace. 
+ * + * @ingroup factor + */ +namespace factorization { + + +constexpr int default_block_size{512}; + + namespace kernel { @@ -520,3 +550,9 @@ void initialize_l(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); + + +} // namespace factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/factorization/lu_kernels.hpp.inc b/common/cuda_hip/factorization/lu_kernels.cpp similarity index 92% rename from common/cuda_hip/factorization/lu_kernels.hpp.inc rename to common/cuda_hip/factorization/lu_kernels.cpp index f8f317bc6a5..71d09e93ef7 100644 --- a/common/cuda_hip/factorization/lu_kernels.hpp.inc +++ b/common/cuda_hip/factorization/lu_kernels.cpp @@ -2,6 +2,41 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/factorization/lu_kernels.hpp" + +#include +#include + +#include +#include +#include + +#include + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/syncfree.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/base/allocator.hpp" +#include "core/matrix/csr_lookup.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The LU namespace. 
+ * + * @ingroup factor + */ +namespace lu_factorization { + + +constexpr static int default_block_size = 512; + + namespace kernel { @@ -301,3 +336,9 @@ void symbolic_factorize_simple_finalize( GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE); + + +} // namespace lu_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc b/common/cuda_hip/factorization/par_ic_kernels.cpp similarity index 84% rename from common/cuda_hip/factorization/par_ic_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ic_kernels.cpp index dd30eb2fc1c..7102d782b94 100644 --- a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ic_kernels.cpp @@ -2,6 +2,32 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/factorization/par_ic_kernels.hpp" + +#include +#include +#include + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The parallel ic factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ic_factorization { + + +constexpr int default_block_size = 512; + + namespace kernel { @@ -111,3 +137,9 @@ void compute_factor(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL); + + +} // namespace par_ic_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilu_kernels.cpp similarity index 84% rename from common/cuda_hip/factorization/par_ilu_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ilu_kernels.cpp index 1029c0d08f6..447fdb99c2c 100644 --- a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilu_kernels.cpp @@ -2,6 +2,31 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/factorization/par_ilu_kernels.hpp" + +#include + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The parallel ilu factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilu_factorization { + + +constexpr int default_block_size{512}; + + namespace kernel { @@ -85,3 +110,9 @@ void compute_l_u_factors(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL); + + +} // namespace par_ilu_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/log/batch_logger.hpp.inc b/common/cuda_hip/log/batch_logger.hpp similarity index 67% rename from common/cuda_hip/log/batch_logger.hpp.inc rename to common/cuda_hip/log/batch_logger.hpp index 04b614b50f9..bca07fb9c37 100644 --- a/common/cuda_hip/log/batch_logger.hpp.inc +++ b/common/cuda_hip/log/batch_logger.hpp @@ -2,6 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_INC_ +#define GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_INC_ + + +#include + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_log { + + /** * @see reference/log/batch_logger.hpp */ @@ -28,3 +41,12 @@ class SimpleFinalLogger final { real_type* const final_residuals_; idx_type* const final_iters_; }; + + +} // namespace batch_log +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernels.cpp similarity index 87% rename from common/cuda_hip/matrix/batch_csr_kernels.hpp.inc rename to common/cuda_hip/matrix/batch_csr_kernels.cpp index e041dadaa3e..01edb0e1310 100644 --- a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_csr_kernels.cpp @@ -2,6 +2,46 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/matrix/batch_csr_kernels.hpp" + +#include + +#include +#include +#include + +#include 
"common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Csr matrix format namespace. + * @ref Csr + * @ingroup batch_csr + */ +namespace batch_csr { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + + template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::csr::batch_item& mat, @@ -196,3 +236,14 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } + + +#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_csr +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.cpp similarity index 89% rename from common/cuda_hip/matrix/batch_dense_kernels.hpp.inc rename to common/cuda_hip/matrix/batch_dense_kernels.cpp index f8abf9131a1..90cafc5d1ca 100644 --- a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_dense_kernels.cpp @@ -2,6 +2,46 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/matrix/batch_dense_kernels.hpp" + +#include + +#include +#include +#include + +#include "common/cuda_hip/base/batch_struct.hpp" +#include 
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup batch_dense + */ +namespace batch_dense { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + + template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::dense::batch_item& mat, @@ -243,3 +283,15 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } + + +#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" + + +// clang-format on + + +} // namespace batch_dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.cpp similarity index 87% rename from common/cuda_hip/matrix/batch_ell_kernels.hpp.inc rename to common/cuda_hip/matrix/batch_ell_kernels.cpp index 0a6d1927c96..c5e27e9d1d1 100644 --- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_ell_kernels.cpp @@ -2,6 +2,46 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/matrix/batch_ell_kernels.hpp" + +#include + +#include +#include +#include + +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include 
"common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Ell matrix format namespace. + * @ref Ell + * @ingroup batch_ell + */ +namespace batch_ell { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + + template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::ell::batch_item& mat, @@ -205,3 +245,14 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } + + +#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_ell +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/matrix/coo_kernels.hpp.inc b/common/cuda_hip/matrix/coo_kernels.cpp similarity index 91% rename from common/cuda_hip/matrix/coo_kernels.hpp.inc rename to common/cuda_hip/matrix/coo_kernels.cpp index 98332f6cd7b..00ab983bc9f 100644 --- a/common/cuda_hip/matrix/coo_kernels.hpp.inc +++ b/common/cuda_hip/matrix/coo_kernels.cpp @@ -2,6 +2,42 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/matrix/coo_kernels.hpp" + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include 
"common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/matrix/dense_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Coordinate matrix format namespace. + * + * @ingroup coo + */ +namespace coo { + + +constexpr int warps_in_block = 4; +constexpr int spmv_block_size = warps_in_block * config::warp_size; + + namespace { @@ -304,3 +340,9 @@ void advanced_spmv2(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); + + +} // namespace coo +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.cpp similarity index 75% rename from common/cuda_hip/matrix/dense_kernels.hpp.inc rename to common/cuda_hip/matrix/dense_kernels.cpp index b48d2c4ff4f..b44c0396823 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.cpp @@ -2,6 +2,46 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/matrix/dense_kernels.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" 
+#include "core/base/utils.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +constexpr int default_block_size = 512; + + namespace kernel { @@ -619,3 +659,188 @@ void convert_to_sparsity_csr(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); + + +template +void compute_dot_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result, array& tmp) +{ + if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::dot(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), y->get_stride(), + result->get_values()); + } else { + compute_dot(exec, x, y, result, tmp); + } + } else { + compute_dot(exec, x, y, result, tmp); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); + + +template +void compute_conj_dot_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result, + array& tmp) +{ + if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::conj_dot(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), + y->get_stride(), result->get_values()); + } else { + compute_conj_dot(exec, x, y, result, tmp); + } + } else { + compute_conj_dot(exec, x, y, result, tmp); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); + + +template +void compute_norm2_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result, + array& tmp) +{ + if (x->get_size()[1] == 1) { + if (blas::is_supported::value) { + auto 
handle = exec->get_blas_handle(); + blas::norm2(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), result->get_values()); + } else { + compute_norm2(exec, x, result, tmp); + } + } else { + compute_norm2(exec, x, result, tmp); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); + + +template +void simple_apply(std::shared_ptr exec, + const matrix::Dense* a, + const matrix::Dense* b, + matrix::Dense* c) +{ + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { + if (a->get_size()[1] > 0) { + blas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1], + c->get_size()[0], a->get_size()[1], &alpha, + b->get_const_values(), b->get_stride(), + a->get_const_values(), a->get_stride(), &beta, + c->get_values(), c->get_stride()); + } else { + dense::fill(exec, c, zero()); + } + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); + + +template +void apply(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Dense* a, const matrix::Dense* b, + const matrix::Dense* beta, matrix::Dense* c) +{ + if (blas::is_supported::value) { + if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { + if (a->get_size()[1] > 0) { + blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N, + c->get_size()[1], c->get_size()[0], a->get_size()[1], + alpha->get_const_values(), b->get_const_values(), + b->get_stride(), a->get_const_values(), + a->get_stride(), beta->get_const_values(), + c->get_values(), c->get_stride()); + } else { + dense::scale(exec, beta, c); + } + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); + + +template +void transpose(std::shared_ptr exec, + const matrix::Dense* orig, + matrix::Dense* trans) +{ + if 
(blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { + blas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, trans->get_const_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); + } + } else { + GKO_NOT_IMPLEMENTED; + } +}; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); + + +template +void conj_transpose(std::shared_ptr exec, + const matrix::Dense* orig, + matrix::Dense* trans) +{ + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { + blas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, trans->get_const_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/matrix/diagonal_kernels.hpp.inc b/common/cuda_hip/matrix/diagonal_kernels.cpp similarity index 73% rename from common/cuda_hip/matrix/diagonal_kernels.hpp.inc rename to common/cuda_hip/matrix/diagonal_kernels.cpp index c3919fda079..a824abc6f7c 100644 --- a/common/cuda_hip/matrix/diagonal_kernels.hpp.inc +++ b/common/cuda_hip/matrix/diagonal_kernels.cpp @@ -2,6 +2,32 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/matrix/diagonal_kernels.hpp" + +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include 
"common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Diagonal matrix format namespace. + * + * @ingroup diagonal + */ +namespace diagonal { + + +constexpr int default_block_size = 512; + + namespace kernel { @@ -57,3 +83,9 @@ void apply_to_csr(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL); + + +} // namespace diagonal +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/cuda/matrix/ell_kernels.cu b/common/cuda_hip/matrix/ell_kernels.cpp similarity index 61% rename from cuda/matrix/ell_kernels.cu rename to common/cuda_hip/matrix/ell_kernels.cpp index 5c81fa7c994..40f174a25c7 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/common/cuda_hip/matrix/ell_kernels.cpp @@ -18,21 +18,21 @@ #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The ELL matrix format namespace. 
* @@ -75,7 +75,135 @@ constexpr int max_thread_per_worker = 32; using compiled_kernels = syn::value_list; -#include "common/cuda_hip/matrix/ell_kernels.hpp.inc" +namespace kernel { + + +template +__device__ void spmv_kernel( + const size_type num_rows, const int num_worker_per_row, + acc::range val, const IndexType* __restrict__ col, + const size_type stride, const size_type num_stored_elements_per_row, + acc::range b, OutputValueType* __restrict__ c, + const size_type c_stride, Closure op) +{ + using arithmetic_type = typename a_accessor::arithmetic_type; + const auto tidx = thread::get_thread_id_flat(); + const decltype(tidx) column_id = blockIdx.y; + if (num_thread_per_worker == 1) { + // Specialize the num_thread_per_worker = 1. It doesn't need the shared + // memory, __syncthreads, and atomic_add + if (tidx < num_rows) { + auto temp = zero(); + for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { + const auto ind = tidx + idx * stride; + const auto col_idx = col[ind]; + if (col_idx == invalid_index()) { + break; + } else { + temp += val(ind) * b(col_idx, column_id); + } + } + const auto c_ind = tidx * c_stride + column_id; + c[c_ind] = op(temp, c[c_ind]); + } + } else { + if (tidx < num_worker_per_row * num_rows) { + const auto idx_in_worker = threadIdx.y; + const auto x = tidx % num_rows; + const auto worker_id = tidx / num_rows; + const auto step_size = num_worker_per_row * num_thread_per_worker; + __shared__ uninitialized_array< + arithmetic_type, default_block_size / num_thread_per_worker> + storage; + if (idx_in_worker == 0) { + storage[threadIdx.x] = 0; + } + __syncthreads(); + auto temp = zero(); + for (size_type idx = + worker_id * num_thread_per_worker + idx_in_worker; + idx < num_stored_elements_per_row; idx += step_size) { + const auto ind = x + idx * stride; + const auto col_idx = col[ind]; + if (col_idx == invalid_index()) { + break; + } else { + temp += val(ind) * b(col_idx, column_id); + } + } + atomic_add(&storage[threadIdx.x], 
temp); + __syncthreads(); + if (idx_in_worker == 0) { + const auto c_ind = x * c_stride + column_id; + if (atomic) { + atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind])); + } else { + c[c_ind] = op(storage[threadIdx.x], c[c_ind]); + } + } + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void spmv( + const size_type num_rows, const int num_worker_per_row, + acc::range val, const IndexType* __restrict__ col, + const size_type stride, const size_type num_stored_elements_per_row, + acc::range b, OutputValueType* __restrict__ c, + const size_type c_stride) +{ + spmv_kernel( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, c, c_stride, + [](const auto& x, const OutputValueType& y) { + return static_cast(x); + }); +} + + +template +__global__ __launch_bounds__(default_block_size) void spmv( + const size_type num_rows, const int num_worker_per_row, + acc::range alpha, acc::range val, + const IndexType* __restrict__ col, const size_type stride, + const size_type num_stored_elements_per_row, acc::range b, + const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, + const size_type c_stride) +{ + using arithmetic_type = typename a_accessor::arithmetic_type; + const auto alpha_val = alpha(0); + const OutputValueType beta_val = beta[0]; + if (atomic) { + // Because the atomic operation changes the values of c during + // computation, it can not directly do alpha * a * b + beta * c + // operation. The beta * c needs to be done before calling this kernel. + // Then, this kernel only adds alpha * a * b when it uses atomic + // operation. 
+ spmv_kernel( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, c, c_stride, + [&alpha_val](const auto& x, const OutputValueType& y) { + return static_cast(alpha_val * x); + }); + } else { + spmv_kernel( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, c, c_stride, + [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) { + return static_cast( + alpha_val * x + static_cast(beta_val * y)); + }); + } +} + + +} // namespace kernel namespace { @@ -156,7 +284,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, abstract_spmv); template std::array compute_thread_worker_and_atomicity( - std::shared_ptr exec, + std::shared_ptr exec, const matrix::Ell* a) { int num_thread_per_worker = 1; @@ -200,7 +328,7 @@ std::array compute_thread_worker_and_atomicity( template -void spmv(std::shared_ptr exec, +void spmv(std::shared_ptr exec, const matrix::Ell* a, const matrix::Dense* b, matrix::Dense* c) @@ -232,7 +360,7 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( template -void advanced_spmv(std::shared_ptr exec, +void advanced_spmv(std::shared_ptr exec, const matrix::Dense* alpha, const matrix::Ell* a, const matrix::Dense* b, @@ -265,6 +393,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( } // namespace ell -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/matrix/ell_kernels.hpp.inc b/common/cuda_hip/matrix/ell_kernels.hpp.inc deleted file mode 100644 index a5fd37c1d05..00000000000 --- a/common/cuda_hip/matrix/ell_kernels.hpp.inc +++ /dev/null @@ -1,133 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -namespace kernel { - - -template -__device__ void spmv_kernel( - const size_type num_rows, const int num_worker_per_row, - acc::range val, const IndexType* __restrict__ col, - const 
size_type stride, const size_type num_stored_elements_per_row, - acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride, Closure op) -{ - using arithmetic_type = typename a_accessor::arithmetic_type; - const auto tidx = thread::get_thread_id_flat(); - const decltype(tidx) column_id = blockIdx.y; - if (num_thread_per_worker == 1) { - // Specialize the num_thread_per_worker = 1. It doesn't need the shared - // memory, __syncthreads, and atomic_add - if (tidx < num_rows) { - auto temp = zero(); - for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { - const auto ind = tidx + idx * stride; - const auto col_idx = col[ind]; - if (col_idx == invalid_index()) { - break; - } else { - temp += val(ind) * b(col_idx, column_id); - } - } - const auto c_ind = tidx * c_stride + column_id; - c[c_ind] = op(temp, c[c_ind]); - } - } else { - if (tidx < num_worker_per_row * num_rows) { - const auto idx_in_worker = threadIdx.y; - const auto x = tidx % num_rows; - const auto worker_id = tidx / num_rows; - const auto step_size = num_worker_per_row * num_thread_per_worker; - __shared__ uninitialized_array< - arithmetic_type, default_block_size / num_thread_per_worker> - storage; - if (idx_in_worker == 0) { - storage[threadIdx.x] = 0; - } - __syncthreads(); - auto temp = zero(); - for (size_type idx = - worker_id * num_thread_per_worker + idx_in_worker; - idx < num_stored_elements_per_row; idx += step_size) { - const auto ind = x + idx * stride; - const auto col_idx = col[ind]; - if (col_idx == invalid_index()) { - break; - } else { - temp += val(ind) * b(col_idx, column_id); - } - } - atomic_add(&storage[threadIdx.x], temp); - __syncthreads(); - if (idx_in_worker == 0) { - const auto c_ind = x * c_stride + column_id; - if (atomic) { - atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind])); - } else { - c[c_ind] = op(storage[threadIdx.x], c[c_ind]); - } - } - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void spmv( - const 
size_type num_rows, const int num_worker_per_row, - acc::range val, const IndexType* __restrict__ col, - const size_type stride, const size_type num_stored_elements_per_row, - acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride) -{ - spmv_kernel( - num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, c, c_stride, - [](const auto& x, const OutputValueType& y) { - return static_cast(x); - }); -} - - -template -__global__ __launch_bounds__(default_block_size) void spmv( - const size_type num_rows, const int num_worker_per_row, - acc::range alpha, acc::range val, - const IndexType* __restrict__ col, const size_type stride, - const size_type num_stored_elements_per_row, acc::range b, - const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, - const size_type c_stride) -{ - using arithmetic_type = typename a_accessor::arithmetic_type; - const auto alpha_val = alpha(0); - const OutputValueType beta_val = beta[0]; - if (atomic) { - // Because the atomic operation changes the values of c during - // computation, it can not directly do alpha * a * b + beta * c - // operation. The beta * c needs to be done before calling this kernel. - // Then, this kernel only adds alpha * a * b when it uses atomic - // operation. 
- spmv_kernel( - num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, c, c_stride, - [&alpha_val](const auto& x, const OutputValueType& y) { - return static_cast(alpha_val * x); - }); - } else { - spmv_kernel( - num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, c, c_stride, - [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) { - return static_cast( - alpha_val * x + static_cast(beta_val * y)); - }); - } -} - - -} // namespace kernel diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.cpp similarity index 57% rename from common/cuda_hip/matrix/fbcsr_kernels.hpp.inc rename to common/cuda_hip/matrix/fbcsr_kernels.cpp index d801876adbc..f6276fdd056 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.cpp @@ -2,6 +2,69 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/matrix/fbcsr_kernels.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/sparselib_block_bindings.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/unified/base/kernel_launch.hpp" 
+#include "core/base/array_access.hpp" +#include "core/base/block_sizes.hpp" +#include "core/base/device_matrix_data_kernels.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + +/** + * @brief The fixed-size block compressed sparse row matrix format namespace. + * + * @ingroup fbcsr + */ +namespace fbcsr { + + +constexpr int default_block_size{512}; + + +#include "common/cuda_hip/matrix/csr_common.hpp.inc" + namespace kernel { @@ -341,3 +404,235 @@ template void extract_diagonal(std::shared_ptr exec, const matrix::Fbcsr* orig, matrix::Diagonal* diag) GKO_NOT_IMPLEMENTED; + + +namespace { + + +template +void dense_transpose(std::shared_ptr exec, + const size_type nrows, const size_type ncols, + const size_type orig_stride, const ValueType* const orig, + const size_type trans_stride, ValueType* const trans) +{ + if (nrows == 0) { + return; + } + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + { + blas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig, + orig_stride, &beta, trans, trans_stride, trans, + trans_stride); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + + +} // namespace + + +template +void spmv(std::shared_ptr exec, + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + matrix::Dense* const c) +{ + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { + // empty input: fill output with zero + dense::fill(exec, c, zero()); + return; + } + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + 
sparselib::pointer_mode_guard pm_guard(handle); + const auto alpha = one(); + const auto beta = zero(); + auto descr = sparselib::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { + sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, &alpha, descr, values, row_ptrs, col_idxs, + bs, b->get_const_values(), &beta, c->get_values()); + } else { + const auto trans_stride = nrows; + auto trans_c = array(exec, nrows * nrhs); + sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + &alpha, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, &beta, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); + } + sparselib::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} + + +template +void advanced_spmv(std::shared_ptr exec, + const matrix::Dense* const alpha, + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + const matrix::Dense* const beta, + matrix::Dense* const c) +{ + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { + // empty input: scale output + dense::scale(exec, beta, c); + return; + } + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + 
const auto alphp = alpha->get_const_values(); + const auto betap = beta->get_const_values(); + auto descr = sparselib::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { + sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), betap, c->get_values()); + } else { + const auto trans_stride = nrows; + auto trans_c = array(exec, nrows * nrhs); + dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), + trans_stride, trans_c.get_data()); + sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, betap, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); + } + sparselib::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} + + +namespace { + + +template +void transpose_blocks_impl(syn::value_list, + std::shared_ptr exec, + matrix::Fbcsr* const mat) +{ + constexpr int subwarp_size = config::warp_size; + const auto nbnz = mat->get_num_stored_blocks(); + const auto numthreads = nbnz * subwarp_size; + const auto block_size = default_block_size; + const auto grid_dim = ceildiv(numthreads, block_size); + if (grid_dim > 0) { + kernel::transpose_blocks + <<get_stream()>>>( + nbnz, 
mat->get_values()); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks, + transpose_blocks_impl); + + +} // namespace + + +template +void transpose(const std::shared_ptr exec, + const matrix::Fbcsr* const orig, + matrix::Fbcsr* const trans) +{ +#ifdef GKO_COMPILING_CUDA + if (sparselib::is_supported::value) { + const int bs = orig->get_block_size(); + const IndexType nnzb = + static_cast(orig->get_num_stored_blocks()); + cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; + cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; + const IndexType buffer_size = sparselib::bsr_transpose_buffersize( + exec->get_sparselib_handle(), orig->get_num_block_rows(), + orig->get_num_block_cols(), nnzb, orig->get_const_values(), + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs); + array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + sparselib::bsr_transpose( + exec->get_sparselib_handle(), orig->get_num_block_rows(), + orig->get_num_block_cols(), nnzb, orig->get_const_values(), + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs, + trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(), + copyValues, idxBase, buffer); + + // transpose blocks + select_transpose_blocks( + fixedblock::compiled_kernels(), + [bs](int compiled_block_size) { return bs == compiled_block_size; }, + syn::value_list(), syn::type_list<>(), exec, trans); + } else +#endif + { + fallback_transpose(exec, orig, trans); + } +} + + +template +void conj_transpose(std::shared_ptr exec, + const matrix::Fbcsr* orig, + matrix::Fbcsr* trans) +{ + const int grid_size = + ceildiv(trans->get_num_stored_elements(), default_block_size); + transpose(exec, orig, trans); + if (grid_size > 0 && is_complex()) { + kernel:: + conjugate<<get_stream()>>>( + trans->get_num_stored_elements(), + as_device_type(trans->get_values())); + } +} + + +} // namespace fbcsr +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ 
No newline at end of file diff --git a/common/cuda_hip/matrix/sellp_kernels.hpp.inc b/common/cuda_hip/matrix/sellp_kernels.cpp similarity index 83% rename from common/cuda_hip/matrix/sellp_kernels.hpp.inc rename to common/cuda_hip/matrix/sellp_kernels.cpp index f4f0035c276..64c672b8d8d 100644 --- a/common/cuda_hip/matrix/sellp_kernels.hpp.inc +++ b/common/cuda_hip/matrix/sellp_kernels.cpp @@ -2,6 +2,37 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/matrix/sellp_kernels.hpp" + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The SELL-P matrix format namespace. 
+ * + * @ingroup sellp + */ +namespace sellp { + + +constexpr int default_block_size = 512; + + template __global__ __launch_bounds__(default_block_size) void spmv_kernel( size_type num_rows, size_type num_right_hand_sides, size_type b_stride, @@ -102,3 +133,9 @@ void advanced_spmv(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); + + +} // namespace sellp +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp similarity index 61% rename from hip/matrix/sparsity_csr_kernels.hip.cpp rename to common/cuda_hip/matrix/sparsity_csr_kernels.cpp index 7a7a4ba49d5..067b2749097 100644 --- a/hip/matrix/sparsity_csr_kernels.hip.cpp +++ b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp @@ -11,24 +11,24 @@ #include "accessor/cuda_hip_helper.hpp" #include "accessor/reduced_row_major.hpp" #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" namespace gko { namespace kernels { -namespace hip { +namespace 
GKO_DEVICE_NAMESPACE { /** * @brief The Compressed sparse row matrix format namespace. * @@ -51,7 +51,114 @@ using classical_kernels = syn::value_list; #include "common/cuda_hip/matrix/csr_common.hpp.inc" -#include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc" + +namespace kernel { + + +template +__device__ void device_classical_spmv(const size_type num_rows, + const MatrixValueType* __restrict__ val, + const IndexType* __restrict__ col_idxs, + const IndexType* __restrict__ row_ptrs, + acc::range b, + acc::range c, + Closure scale) +{ + using arithmetic_type = typename output_accessor::arithmetic_type; + auto subwarp_tile = + group::tiled_partition(group::this_thread_block()); + const auto subrow = thread::get_subwarp_num_flat(); + const auto subid = subwarp_tile.thread_rank(); + const IndexType column_id = blockIdx.y; + const arithmetic_type value = val[0]; + auto row = thread::get_subwarp_id_flat(); + for (; row < num_rows; row += subrow) { + const auto ind_end = row_ptrs[row + 1]; + arithmetic_type temp_val = zero(); + for (auto ind = row_ptrs[row] + subid; ind < ind_end; + ind += subwarp_size) { + temp_val += value * b(col_idxs[ind], column_id); + } + auto subwarp_result = + reduce(subwarp_tile, temp_val, + [](const arithmetic_type& a, const arithmetic_type& b) { + return a + b; + }); + if (subid == 0) { + c(row, column_id) = scale(subwarp_result, c(row, column_id)); + } + } +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv( + const size_type num_rows, const MatrixValueType* __restrict__ val, + const IndexType* __restrict__ col_idxs, + const IndexType* __restrict__ row_ptrs, acc::range b, + acc::range c) +{ + using type = typename output_accessor::arithmetic_type; + device_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, c, + [](const type& x, const type& y) { return x; }); +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv( + const size_type num_rows, const 
MatrixValueType* __restrict__ alpha, + const MatrixValueType* __restrict__ val, + const IndexType* __restrict__ col_idxs, + const IndexType* __restrict__ row_ptrs, acc::range b, + const typename output_accessor::storage_type* __restrict__ beta, + acc::range c) +{ + using type = typename output_accessor::arithmetic_type; + const type alpha_val = alpha[0]; + const type beta_val = beta[0]; + device_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, c, + [&alpha_val, &beta_val](const type& x, const type& y) { + return alpha_val * x + beta_val * y; + }); +} + + +} // namespace kernel + + +template +void transpose(std::shared_ptr exec, + const matrix::SparsityCsr* orig, + matrix::SparsityCsr* trans) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL); + + +template +void fallback_sort(std::shared_ptr exec, + matrix::SparsityCsr* to_sort) +{ + const auto row_ptrs = to_sort->get_const_row_ptrs(); + const auto col_idxs = to_sort->get_col_idxs(); + const auto nnz = to_sort->get_num_nonzeros(); + const auto num_rows = to_sort->get_size()[0]; + array row_idx_array(exec, nnz); + const auto row_idxs = row_idx_array.get_data(); + components::convert_ptrs_to_idxs(exec, row_ptrs, num_rows, row_idxs); + // two sorts by integer keys hopefully enable Thrust to use cub's RadixSort + thrust::sort_by_key(thrust_policy(exec), col_idxs, col_idxs + nnz, + row_idxs); + thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz, + col_idxs); +} namespace host_kernel { @@ -60,7 +167,7 @@ namespace host_kernel { template void classical_spmv(syn::value_list, - std::shared_ptr exec, + std::shared_ptr exec, const matrix::SparsityCsr* a, const matrix::Dense* b, matrix::Dense* c, @@ -129,7 +236,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); template -void spmv(std::shared_ptr exec, +void spmv(std::shared_ptr exec, const matrix::SparsityCsr* a, const matrix::Dense* b, matrix::Dense* 
c) @@ -145,7 +252,7 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( template -void advanced_spmv(std::shared_ptr exec, +void advanced_spmv(std::shared_ptr exec, const matrix::Dense* alpha, const matrix::SparsityCsr* a, const matrix::Dense* b, @@ -218,6 +325,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace sparsity_csr -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc deleted file mode 100644 index aedf9638888..00000000000 --- a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -namespace kernel { - - -template -__device__ void device_classical_spmv(const size_type num_rows, - const MatrixValueType* __restrict__ val, - const IndexType* __restrict__ col_idxs, - const IndexType* __restrict__ row_ptrs, - acc::range b, - acc::range c, - Closure scale) -{ - using arithmetic_type = typename output_accessor::arithmetic_type; - auto subwarp_tile = - group::tiled_partition(group::this_thread_block()); - const auto subrow = thread::get_subwarp_num_flat(); - const auto subid = subwarp_tile.thread_rank(); - const IndexType column_id = blockIdx.y; - const arithmetic_type value = val[0]; - auto row = thread::get_subwarp_id_flat(); - for (; row < num_rows; row += subrow) { - const auto ind_end = row_ptrs[row + 1]; - arithmetic_type temp_val = zero(); - for (auto ind = row_ptrs[row] + subid; ind < ind_end; - ind += subwarp_size) { - temp_val += value * b(col_idxs[ind], column_id); - } - auto subwarp_result = - reduce(subwarp_tile, temp_val, - [](const arithmetic_type& a, const arithmetic_type& b) { - return a + b; - }); - if (subid == 0) { - c(row, column_id) = scale(subwarp_result, c(row, column_id)); - 
} - } -} - - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv( - const size_type num_rows, const MatrixValueType* __restrict__ val, - const IndexType* __restrict__ col_idxs, - const IndexType* __restrict__ row_ptrs, acc::range b, - acc::range c) -{ - using type = typename output_accessor::arithmetic_type; - device_classical_spmv( - num_rows, val, col_idxs, row_ptrs, b, c, - [](const type& x, const type& y) { return x; }); -} - - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv( - const size_type num_rows, const MatrixValueType* __restrict__ alpha, - const MatrixValueType* __restrict__ val, - const IndexType* __restrict__ col_idxs, - const IndexType* __restrict__ row_ptrs, acc::range b, - const typename output_accessor::storage_type* __restrict__ beta, - acc::range c) -{ - using type = typename output_accessor::arithmetic_type; - const type alpha_val = alpha[0]; - const type beta_val = beta[0]; - device_classical_spmv( - num_rows, val, col_idxs, row_ptrs, b, c, - [&alpha_val, &beta_val](const type& x, const type& y) { - return alpha_val * x + beta_val * y; - }); -} - - -} // namespace kernel - - -template -void transpose(std::shared_ptr exec, - const matrix::SparsityCsr* orig, - matrix::SparsityCsr* trans) - GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL); - - -template -void fallback_sort(std::shared_ptr exec, - matrix::SparsityCsr* to_sort) -{ - const auto row_ptrs = to_sort->get_const_row_ptrs(); - const auto col_idxs = to_sort->get_col_idxs(); - const auto nnz = to_sort->get_num_nonzeros(); - const auto num_rows = to_sort->get_size()[0]; - array row_idx_array(exec, nnz); - const auto row_idxs = row_idx_array.get_data(); - components::convert_ptrs_to_idxs(exec, row_ptrs, num_rows, row_idxs); - // two sorts by integer keys hopefully enable Thrust to use cub's RadixSort - thrust::sort_by_key(thrust_policy(exec), col_idxs, 
col_idxs + nnz, - row_idxs); - thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz, - col_idxs); -} diff --git a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc b/common/cuda_hip/multigrid/pgm_kernels.cpp similarity index 77% rename from common/cuda_hip/multigrid/pgm_kernels.hpp.inc rename to common/cuda_hip/multigrid/pgm_kernels.cpp index 9b2a5735c71..a2c5d608a50 100644 --- a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc +++ b/common/cuda_hip/multigrid/pgm_kernels.cpp @@ -2,6 +2,34 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/multigrid/pgm_kernels.hpp" + +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The PGM solver namespace. + * + * @ingroup pgm + */ +namespace pgm { + + template void sort_agg(std::shared_ptr exec, IndexType num, IndexType* row_idxs, IndexType* col_idxs) @@ -52,3 +80,9 @@ void compute_coarse_coo(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_PGM_COMPUTE_COARSE_COO); + + +} // namespace pgm +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/preconditioner/isai_kernels.hpp.inc b/common/cuda_hip/preconditioner/isai_kernels.cpp similarity index 95% rename from common/cuda_hip/preconditioner/isai_kernels.hpp.inc rename to common/cuda_hip/preconditioner/isai_kernels.cpp index 86d47680e0e..eda1f9a0661 100644 --- a/common/cuda_hip/preconditioner/isai_kernels.hpp.inc +++ b/common/cuda_hip/preconditioner/isai_kernels.cpp @@ -2,6 +2,42 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/preconditioner/isai_kernels.hpp" + +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include 
"common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/csr_builder.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Isai preconditioner namespace. + * @ref Isai + * @ingroup isai + */ +namespace isai { + + +constexpr int subwarp_size{row_size_limit}; +constexpr int subwarps_per_block{2}; +constexpr int default_block_size{subwarps_per_block * subwarp_size}; + + namespace kernel { @@ -559,3 +595,9 @@ void scatter_excess_solution(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); + + +} // namespace isai +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_kernels.cpp similarity index 91% rename from common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc rename to common/cuda_hip/preconditioner/jacobi_kernels.cpp index e0d7cfef0e9..27069d2f693 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp @@ -2,6 +2,44 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/preconditioner/jacobi_kernels.hpp" + +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include 
"common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" +#include "core/base/extended_float.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +// a total of 32/16 warps (1024 threads) +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC +constexpr int default_num_warps = 16; +#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC +constexpr int default_num_warps = 32; +#endif +// with current architectures, at most 32 warps can be scheduled per SM (and +// current GPUs have at most 84 SMs) +constexpr int default_grid_size = 32 * 32 * 128; + + namespace { @@ -369,3 +407,9 @@ void convert_to_dense( GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); + + +} // namespace jacobi +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/reorder/rcm_kernels.hpp.inc b/common/cuda_hip/reorder/rcm_kernels.cpp similarity index 95% rename from common/cuda_hip/reorder/rcm_kernels.hpp.inc rename to common/cuda_hip/reorder/rcm_kernels.cpp index 05fe3bce07e..380ef69fac8 100644 --- a/common/cuda_hip/reorder/rcm_kernels.hpp.inc +++ b/common/cuda_hip/reorder/rcm_kernels.cpp @@ -2,6 +2,46 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/reorder/rcm_kernels.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/base/array_access.hpp" + + +namespace gko { +namespace 
kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The reordering namespace. + * + * @ingroup reorder + */ +namespace rcm { + + +constexpr int default_block_size = 512; + + template array compute_node_degrees( std::shared_ptr exec, @@ -613,3 +653,9 @@ void compute_permutation(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_RCM_COMPUTE_PERMUTATION_KERNEL); + + +} // namespace rcm +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/solver/cb_gmres_kernels.hpp.inc b/common/cuda_hip/solver/cb_gmres_kernels.cpp similarity index 50% rename from common/cuda_hip/solver/cb_gmres_kernels.hpp.inc rename to common/cuda_hip/solver/cb_gmres_kernels.cpp index 2a5a6c3f7f9..59c9812dc65 100644 --- a/common/cuda_hip/solver/cb_gmres_kernels.hpp.inc +++ b/common/cuda_hip/solver/cb_gmres_kernels.cpp @@ -2,6 +2,51 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/solver/cb_gmres_kernels.hpp" + +#include + +#include +#include +#include +#include + +#include "accessor/cuda_hip_helper.hpp" +#include "accessor/range.hpp" +#include "accessor/reduced_row_major.hpp" +#include "accessor/scaled_reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/array_access.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/solver/cb_gmres_accessor.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The CB_GMRES solver namespace. 
+ * + * @ingroup cb_gmres + */ +namespace cb_gmres { + + +constexpr int default_block_size = 512; +// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block +// size limit. +constexpr int default_dot_dim = 32; +constexpr int default_dot_size = default_dot_dim * default_dot_dim; + + #include "common/cuda_hip/solver/common_gmres_kernels.hpp.inc" @@ -551,3 +596,457 @@ __global__ __launch_bounds__(block_size) void calculate_Qy_kernel( before_preconditioner[global_id] = temp; } } + + +template +void zero_matrix(std::shared_ptr exec, size_type m, + size_type n, size_type stride, ValueType* array) +{ + const auto block_size = default_block_size; + const auto grid_size = ceildiv(n, block_size); + zero_matrix_kernel<<get_stream()>>>( + m, n, stride, as_device_type(array)); +} + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense* b, + matrix::Dense* residual, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + array* stop_status, size_type krylov_dim) +{ + const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), + krylov_dim * b->get_size()[1]); + const auto grid_dim = ceildiv(num_threads, default_block_size); + const auto block_dim = default_block_size; + constexpr auto block_size = default_block_size; + + initialize_kernel + <<get_stream()>>>( + b->get_size()[0], b->get_size()[1], krylov_dim, + as_device_type(b->get_const_values()), b->get_stride(), + as_device_type(residual->get_values()), residual->get_stride(), + as_device_type(givens_sin->get_values()), givens_sin->get_stride(), + as_device_type(givens_cos->get_values()), givens_cos->get_stride(), + as_device_type(stop_status->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); + + +template +void restart(std::shared_ptr exec, + const matrix::Dense* residual, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + matrix::Dense>* arnoldi_norm, + Accessor3d krylov_bases, + matrix::Dense* 
next_krylov_basis, + array* final_iter_nums, array& reduction_tmp, + size_type krylov_dim) +{ + constexpr bool use_scalar = + gko::cb_gmres::detail::has_3d_scaled_accessor::value; + const auto num_rows = residual->get_size()[0]; + const auto num_rhs = residual->get_size()[1]; + const auto krylov_stride = + gko::cb_gmres::helper_functions_accessor::get_stride( + krylov_bases); + const auto grid_dim_1 = + ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size); + const auto block_dim = default_block_size; + constexpr auto block_size = default_block_size; + const auto stride_arnoldi = arnoldi_norm->get_stride(); + + restart_1_kernel + <<get_stream()>>>( + residual->get_size()[0], residual->get_size()[1], krylov_dim, + acc::as_device_range(krylov_bases), + as_device_type(residual_norm_collection->get_values()), + residual_norm_collection->get_stride()); + kernels::GKO_DEVICE_NAMESPACE::dense::compute_norm2_dispatch( + exec, residual, residual_norm, reduction_tmp); + + if (use_scalar) { + components::fill_array(exec, + arnoldi_norm->get_values() + 2 * stride_arnoldi, + num_rhs, zero>()); + const dim3 grid_size_nrm(ceildiv(num_rhs, default_dot_dim), + exec->get_num_multiprocessor() * 2); + const dim3 block_size_nrm(default_dot_dim, default_dot_dim); + multinorminf_without_stop_kernel<<get_stream()>>>( + num_rows, num_rhs, as_device_type(residual->get_const_values()), + residual->get_stride(), + as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), 0); + } + + if (gko::cb_gmres::detail::has_3d_scaled_accessor::value) { + set_scalar_kernel + <<get_stream()>>>( + num_rhs, krylov_dim + 1, + as_device_type(residual_norm->get_const_values()), + residual_norm->get_stride(), + as_device_type(arnoldi_norm->get_const_values() + + 2 * stride_arnoldi), + stride_arnoldi, acc::as_device_range(krylov_bases)); + } + + const auto grid_dim_2 = + ceildiv(std::max(num_rows, 1) * krylov_stride[1], + default_block_size); + restart_2_kernel + <<get_stream()>>>( + 
residual->get_size()[0], residual->get_size()[1], + as_device_type(residual->get_const_values()), + residual->get_stride(), + as_device_type(residual_norm->get_const_values()), + as_device_type(residual_norm_collection->get_values()), + acc::as_device_range(krylov_bases), + as_device_type(next_krylov_basis->get_values()), + next_krylov_basis->get_stride(), + as_device_type(final_iter_nums->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL); + + +template +void finish_arnoldi_CGS(std::shared_ptr exec, + matrix::Dense* next_krylov_basis, + Accessor3dim krylov_bases, + matrix::Dense* hessenberg_iter, + matrix::Dense* buffer_iter, + matrix::Dense>* arnoldi_norm, + size_type iter, const stopping_status* stop_status, + stopping_status* reorth_status, + array* num_reorth) +{ + const auto dim_size = next_krylov_basis->get_size(); + if (dim_size[1] == 0) { + return; + } + using non_complex = remove_complex; + // optimization parameter + constexpr int singledot_block_size = default_dot_dim; + constexpr bool use_scalar = + gko::cb_gmres::detail::has_3d_scaled_accessor::value; + const auto stride_next_krylov = next_krylov_basis->get_stride(); + const auto stride_hessenberg = hessenberg_iter->get_stride(); + const auto stride_buffer = buffer_iter->get_stride(); + const auto stride_arnoldi = arnoldi_norm->get_stride(); + const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim), + exec->get_num_multiprocessor() * 2); + const dim3 grid_size_num_iters(ceildiv(dim_size[1], default_dot_dim), + exec->get_num_multiprocessor() * 2, + iter + 1); + const dim3 block_size(default_dot_dim, default_dot_dim); + // Note: having iter first (instead of row_idx information) is likely + // beneficial for avoiding atomic_add conflicts, but that needs + // further investigation. 
+ const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2, + iter + 1); + const auto block_size_iters_single = singledot_block_size; + size_type num_reorth_host; + + components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1], + zero()); + multinorm2_kernel<<get_stream()>>>( + dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, as_device_type(arnoldi_norm->get_values()), + as_device_type(stop_status)); + // nrmP = norm(next_krylov_basis) + zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg, + hessenberg_iter->get_values()); + if (dim_size[1] > 1) { + multidot_kernel + <<get_stream()>>>( + dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, acc::as_device_range(krylov_bases), + as_device_type(hessenberg_iter->get_values()), + stride_hessenberg, as_device_type(stop_status)); + } else { + singledot_kernel + <<get_stream()>>>( + dim_size[0], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, acc::as_device_range(krylov_bases), + as_device_type(hessenberg_iter->get_values()), + stride_hessenberg, as_device_type(stop_status)); + } + // for i in 1:iter + // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) + // end + update_next_krylov_kernel + <<get_stream()>>>( + iter + 1, dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_values()), stride_next_krylov, + acc::as_device_range(krylov_bases), + as_device_type(hessenberg_iter->get_const_values()), + stride_hessenberg, as_device_type(stop_status)); + + // for i in 1:iter + // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) + // end + components::fill_array(exec, arnoldi_norm->get_values() + stride_arnoldi, + dim_size[1], zero()); + if (use_scalar) { + components::fill_array(exec, + arnoldi_norm->get_values() + 2 * stride_arnoldi, + dim_size[1], zero()); + } + multinorm2_inf_kernel + <<get_stream()>>>( + dim_size[0], 
dim_size[1], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, + as_device_type(arnoldi_norm->get_values() + stride_arnoldi), + as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), + as_device_type(stop_status)); + // nrmN = norm(next_krylov_basis) + components::fill_array(exec, num_reorth->get_data(), 1, zero()); + check_arnoldi_norms + <<get_stream()>>>( + dim_size[1], as_device_type(arnoldi_norm->get_values()), + stride_arnoldi, as_device_type(hessenberg_iter->get_values()), + stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), + as_device_type(stop_status), as_device_type(reorth_status), + as_device_type(num_reorth->get_data())); + num_reorth_host = get_element(*num_reorth, 0); + // num_reorth_host := number of next_krylov vector to be reorthogonalization + for (size_type l = 1; (num_reorth_host > 0) && (l < 3); l++) { + zero_matrix(exec, iter + 1, dim_size[1], stride_buffer, + buffer_iter->get_values()); + if (dim_size[1] > 1) { + multidot_kernel + <<get_stream()>>>( + dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, acc::as_device_range(krylov_bases), + as_device_type(buffer_iter->get_values()), stride_buffer, + as_device_type(stop_status)); + } else { + singledot_kernel + <<get_stream()>>>( + dim_size[0], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, acc::as_device_range(krylov_bases), + as_device_type(buffer_iter->get_values()), stride_buffer, + as_device_type(stop_status)); + } + // for i in 1:iter + // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) + // end + update_next_krylov_and_add_kernel + <<get_stream()>>>( + iter + 1, dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_values()), + stride_next_krylov, acc::as_device_range(krylov_bases), + as_device_type(hessenberg_iter->get_values()), + stride_hessenberg, + as_device_type(buffer_iter->get_const_values()), stride_buffer, + 
as_device_type(stop_status), as_device_type(reorth_status)); + // for i in 1:iter + // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) + // end + components::fill_array(exec, + arnoldi_norm->get_values() + stride_arnoldi, + dim_size[1], zero()); + if (use_scalar) { + components::fill_array( + exec, arnoldi_norm->get_values() + 2 * stride_arnoldi, + dim_size[1], zero()); + } + multinorm2_inf_kernel + <<get_stream()>>>( + dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, + as_device_type(arnoldi_norm->get_values() + stride_arnoldi), + as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), + as_device_type(stop_status)); + // nrmN = norm(next_krylov_basis) + components::fill_array(exec, num_reorth->get_data(), 1, + zero()); + check_arnoldi_norms + <<get_stream()>>>( + dim_size[1], as_device_type(arnoldi_norm->get_values()), + stride_arnoldi, as_device_type(hessenberg_iter->get_values()), + stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), + as_device_type(stop_status), as_device_type(reorth_status), + num_reorth->get_data()); + num_reorth_host = get_element(*num_reorth, 0); + // num_reorth_host := number of next_krylov vector to be + // reorthogonalization + } + update_krylov_next_krylov_kernel + <<get_stream()>>>( + iter, dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_values()), stride_next_krylov, + acc::as_device_range(krylov_bases), + as_device_type(hessenberg_iter->get_const_values()), + stride_hessenberg, as_device_type(stop_status)); + // next_krylov_basis /= hessenberg(iter, iter + 1) + // krylov_bases(:, iter + 1) = next_krylov_basis + // End of arnoldi +} + +template +void givens_rotation(std::shared_ptr exec, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + matrix::Dense* hessenberg_iter, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + size_type iter, const array* stop_status) +{ + // TODO: tune block_size 
for optimal performance + constexpr auto block_size = default_block_size; + const auto num_cols = hessenberg_iter->get_size()[1]; + const auto block_dim = block_size; + const auto grid_dim = + static_cast(ceildiv(num_cols, block_size)); + + givens_rotation_kernel + <<get_stream()>>>( + hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1], + iter, as_device_type(hessenberg_iter->get_values()), + hessenberg_iter->get_stride(), + as_device_type(givens_sin->get_values()), givens_sin->get_stride(), + as_device_type(givens_cos->get_values()), givens_cos->get_stride(), + as_device_type(residual_norm->get_values()), + as_device_type(residual_norm_collection->get_values()), + residual_norm_collection->get_stride(), + stop_status->get_const_data()); +} + + +template +void arnoldi(std::shared_ptr exec, + matrix::Dense* next_krylov_basis, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, + matrix::Dense* buffer_iter, + matrix::Dense>* arnoldi_norm, + size_type iter, array* final_iter_nums, + const array* stop_status, + array* reorth_status, + array* num_reorth) +{ + increase_final_iteration_numbers_kernel<<< + static_cast( + ceildiv(final_iter_nums->get_size(), default_block_size)), + default_block_size, 0, exec->get_stream()>>>( + as_device_type(final_iter_nums->get_data()), + stop_status->get_const_data(), final_iter_nums->get_size()); + finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter, + buffer_iter, arnoldi_norm, iter, + stop_status->get_const_data(), reorth_status->get_data(), + num_reorth); + givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter, + residual_norm, residual_norm_collection, iter, stop_status); +} + +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL); + + +template +void solve_upper_triangular( + std::shared_ptr exec, + const matrix::Dense* 
residual_norm_collection, + const matrix::Dense* hessenberg, matrix::Dense* y, + const array* final_iter_nums) +{ + // TODO: tune block_size for optimal performance + constexpr auto block_size = default_block_size; + const auto num_rhs = residual_norm_collection->get_size()[1]; + const auto block_dim = block_size; + const auto grid_dim = + static_cast(ceildiv(num_rhs, block_size)); + + solve_upper_triangular_kernel + <<get_stream()>>>( + hessenberg->get_size()[1], num_rhs, + as_device_type(residual_norm_collection->get_const_values()), + residual_norm_collection->get_stride(), + as_device_type(hessenberg->get_const_values()), + hessenberg->get_stride(), as_device_type(y->get_values()), + y->get_stride(), as_device_type(final_iter_nums->get_const_data())); +} + + +template +void calculate_qy(std::shared_ptr exec, + ConstAccessor3d krylov_bases, size_type num_krylov_bases, + const matrix::Dense* y, + matrix::Dense* before_preconditioner, + const array* final_iter_nums) +{ + const auto num_rows = before_preconditioner->get_size()[0]; + const auto num_cols = before_preconditioner->get_size()[1]; + const auto stride_before_preconditioner = + before_preconditioner->get_stride(); + + constexpr auto block_size = default_block_size; + const auto grid_dim = static_cast( + ceildiv(num_rows * stride_before_preconditioner, block_size)); + const auto block_dim = block_size; + + calculate_Qy_kernel + <<get_stream()>>>( + num_rows, num_cols, acc::as_device_range(krylov_bases), + as_device_type(y->get_const_values()), y->get_stride(), + as_device_type(before_preconditioner->get_values()), + stride_before_preconditioner, + as_device_type(final_iter_nums->get_const_data())); + // Calculate qy + // before_preconditioner = krylov_bases * y +} + + +template +void solve_krylov(std::shared_ptr exec, + const matrix::Dense* residual_norm_collection, + ConstAccessor3d krylov_bases, + const matrix::Dense* hessenberg, + matrix::Dense* y, + matrix::Dense* before_preconditioner, + const array* 
final_iter_nums) +{ + if (before_preconditioner->get_size()[1] == 0) { + return; + } + // since hessenberg has dims: iters x iters * num_rhs + // krylov_bases has dims: (iters + 1) x sysmtx[0] x num_rhs + const auto iters = + hessenberg->get_size()[1] / before_preconditioner->get_size()[1]; + const auto num_krylov_bases = iters + 1; + solve_upper_triangular(exec, residual_norm_collection, hessenberg, y, + final_iter_nums); + calculate_qy(exec, krylov_bases, num_krylov_bases, y, before_preconditioner, + final_iter_nums); +} + +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE( + GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL); + + +} // namespace cb_gmres +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/cuda/solver/idr_kernels.cu b/common/cuda_hip/solver/idr_kernels.cpp similarity index 52% rename from cuda/solver/idr_kernels.cu rename to common/cuda_hip/solver/idr_kernels.cpp index 34aac3751d6..63c5f015f68 100644 --- a/cuda/solver/idr_kernels.cu +++ b/common/cuda_hip/solver/idr_kernels.cpp @@ -12,20 +12,20 @@ #include "common/cuda_hip/base/blas_bindings.hpp" #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/randlib_bindings.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/fill_array_kernels.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The IDR solver namespace. 
* @@ -39,7 +39,320 @@ constexpr int default_dot_dim = 32; constexpr int default_dot_size = default_dot_dim * default_dot_dim; -#include "common/cuda_hip/solver/idr_kernels.hpp.inc" +template +__global__ __launch_bounds__(default_block_size) void initialize_m_kernel( + size_type subspace_dim, size_type nrhs, ValueType* __restrict__ m_values, + size_type m_stride, stopping_status* __restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + const auto row = global_id / m_stride; + const auto col = global_id % m_stride; + + if (global_id < nrhs) { + stop_status[global_id].reset(); + } + + if (row < subspace_dim && col < nrhs * subspace_dim) { + m_values[row * m_stride + col] = + (row == col / nrhs) ? one() : zero(); + } +} + + +template +__global__ +__launch_bounds__(block_size) void orthonormalize_subspace_vectors_kernel( + size_type num_rows, size_type num_cols, ValueType* __restrict__ values, + size_type stride) +{ + const auto tidx = thread::get_thread_id_flat(); + + __shared__ uninitialized_array + reduction_helper_array; + // they are not be used in the same time. 
+ ValueType* reduction_helper = reduction_helper_array; + auto reduction_helper_real = + reinterpret_cast*>(reduction_helper); + + for (size_type row = 0; row < num_rows; row++) { + for (size_type i = 0; i < row; i++) { + auto dot = zero(); + for (size_type j = tidx; j < num_cols; j += block_size) { + dot += values[row * stride + j] * conj(values[i * stride + j]); + } + + // Ensure already finish reading this shared memory + __syncthreads(); + reduction_helper[tidx] = dot; + reduce( + group::this_thread_block(), reduction_helper, + [](const ValueType& a, const ValueType& b) { return a + b; }); + __syncthreads(); + + dot = reduction_helper[0]; + for (size_type j = tidx; j < num_cols; j += block_size) { + values[row * stride + j] -= dot * values[i * stride + j]; + } + } + + auto norm = zero>(); + for (size_type j = tidx; j < num_cols; j += block_size) { + norm += squared_norm(values[row * stride + j]); + } + // Ensure already finish reading this shared memory + __syncthreads(); + reduction_helper_real[tidx] = norm; + reduce(group::this_thread_block(), reduction_helper_real, + [](const remove_complex& a, + const remove_complex& b) { return a + b; }); + __syncthreads(); + + norm = sqrt(reduction_helper_real[0]); + for (size_type j = tidx; j < num_cols; j += block_size) { + values[row * stride + j] /= norm; + } + } +} + + +template +__global__ +__launch_bounds__(default_block_size) void solve_lower_triangular_kernel( + size_type subspace_dim, size_type nrhs, + const ValueType* __restrict__ m_values, size_type m_stride, + const ValueType* __restrict__ f_values, size_type f_stride, + ValueType* __restrict__ c_values, size_type c_stride, + const stopping_status* __restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + + if (global_id >= nrhs) { + return; + } + + if (!stop_status[global_id].has_stopped()) { + for (size_type row = 0; row < subspace_dim; row++) { + auto temp = f_values[row * f_stride + global_id]; + for (size_type col = 0; col < 
row; col++) { + temp -= m_values[row * m_stride + col * nrhs + global_id] * + c_values[col * c_stride + global_id]; + } + c_values[row * c_stride + global_id] = + temp / m_values[row * m_stride + row * nrhs + global_id]; + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void step_1_kernel( + size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs, + const ValueType* __restrict__ residual_values, size_type residual_stride, + const ValueType* __restrict__ c_values, size_type c_stride, + const ValueType* __restrict__ g_values, size_type g_stride, + ValueType* __restrict__ v_values, size_type v_stride, + const stopping_status* __restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + const auto row = global_id / nrhs; + const auto col = global_id % nrhs; + + if (row >= num_rows) { + return; + } + + if (!stop_status[col].has_stopped()) { + auto temp = residual_values[row * residual_stride + col]; + for (size_type j = k; j < subspace_dim; j++) { + temp -= c_values[j * c_stride + col] * + g_values[row * g_stride + j * nrhs + col]; + } + v_values[row * v_stride + col] = temp; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void step_2_kernel( + size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs, + const ValueType* __restrict__ omega_values, + const ValueType* __restrict__ v_values, size_type v_stride, + const ValueType* __restrict__ c_values, size_type c_stride, + ValueType* __restrict__ u_values, size_type u_stride, + const stopping_status* __restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + const auto row = global_id / nrhs; + const auto col = global_id % nrhs; + + if (row >= num_rows) { + return; + } + + if (!stop_status[col].has_stopped()) { + auto temp = omega_values[col] * v_values[row * v_stride + col]; + for (size_type j = k; j < subspace_dim; j++) { + temp += c_values[j * c_stride + col] * + u_values[row * u_stride + j 
* nrhs + col]; + } + u_values[row * u_stride + k * nrhs + col] = temp; + } +} + + +template +__global__ __launch_bounds__(default_dot_size) void multidot_kernel( + size_type num_rows, size_type nrhs, const ValueType* __restrict__ p_i, + const ValueType* __restrict__ g_k, size_type g_k_stride, + ValueType* __restrict__ alpha, + const stopping_status* __restrict__ stop_status) +{ + const auto tidx = threadIdx.x; + const auto tidy = threadIdx.y; + const auto rhs = blockIdx.x * default_dot_dim + tidx; + const auto num = ceildiv(num_rows, gridDim.y); + const auto start_row = blockIdx.y * num; + const auto end_row = + ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num; + // Used that way to get around dynamic initialization warning and + // template error when using `reduction_helper_array` directly in `reduce` + __shared__ + uninitialized_array + reduction_helper_array; + ValueType* __restrict__ reduction_helper = reduction_helper_array; + + ValueType local_res = zero(); + if (rhs < nrhs && !stop_status[rhs].has_stopped()) { + for (size_type i = start_row + tidy; i < end_row; + i += default_dot_dim) { + const auto g_idx = i * g_k_stride + rhs; + local_res += p_i[i] * g_k[g_idx]; + } + } + reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res; + __syncthreads(); + local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx]; + const auto tile_block = + group::tiled_partition(group::this_thread_block()); + const auto sum = + reduce(tile_block, local_res, + [](const ValueType& a, const ValueType& b) { return a + b; }); + const auto new_rhs = blockIdx.x * default_dot_dim + tidy; + if (tidx == 0 && new_rhs < nrhs && !stop_status[new_rhs].has_stopped()) { + atomic_add(alpha + new_rhs, sum); + } +} + + +template +__global__ __launch_bounds__(block_size) void update_g_k_and_u_kernel( + size_type k, size_type i, size_type size, size_type nrhs, + const ValueType* __restrict__ alpha, const ValueType* __restrict__ m_values, + size_type 
m_stride, const ValueType* __restrict__ g_values, + size_type g_stride, ValueType* __restrict__ g_k_values, + size_type g_k_stride, ValueType* __restrict__ u_values, size_type u_stride, + const stopping_status* __restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto row = tidx / g_k_stride; + const auto rhs = tidx % g_k_stride; + + if (row >= size || rhs >= nrhs) { + return; + } + + if (!stop_status[rhs].has_stopped()) { + const auto fact = alpha[rhs] / m_values[i * m_stride + i * nrhs + rhs]; + g_k_values[row * g_k_stride + rhs] -= + fact * g_values[row * g_stride + i * nrhs + rhs]; + u_values[row * u_stride + k * nrhs + rhs] -= + fact * u_values[row * u_stride + i * nrhs + rhs]; + } +} + + +template +__global__ __launch_bounds__(block_size) void update_g_kernel( + size_type k, size_type size, size_type nrhs, + const ValueType* __restrict__ g_k_values, size_type g_k_stride, + ValueType* __restrict__ g_values, size_type g_stride, + const stopping_status* __restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto row = tidx / g_k_stride; + const auto rhs = tidx % nrhs; + + if (row >= size || rhs >= nrhs) { + return; + } + + if (!stop_status[rhs].has_stopped()) { + g_values[row * g_stride + k * nrhs + rhs] = + g_k_values[row * g_k_stride + rhs]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void update_x_r_and_f_kernel( + size_type k, size_type size, size_type subspace_dim, size_type nrhs, + const ValueType* __restrict__ m_values, size_type m_stride, + const ValueType* __restrict__ g_values, size_type g_stride, + const ValueType* __restrict__ u_values, size_type u_stride, + ValueType* __restrict__ f_values, size_type f_stride, + ValueType* __restrict__ r_values, size_type r_stride, + ValueType* __restrict__ x_values, size_type x_stride, + const stopping_status* __restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + const auto row = global_id / 
x_stride; + const auto col = global_id % x_stride; + + if (row >= size || col >= nrhs) { + return; + } + + if (!stop_status[col].has_stopped()) { + const auto beta = f_values[k * f_stride + col] / + m_values[k * m_stride + k * nrhs + col]; + r_values[row * r_stride + col] -= + beta * g_values[row * g_stride + k * nrhs + col]; + x_values[row * x_stride + col] += + beta * u_values[row * u_stride + k * nrhs + col]; + + if (k < row && k + 1 < subspace_dim && row < subspace_dim) { + f_values[row * f_stride + col] -= + beta * m_values[row * m_stride + k * nrhs + col]; + } + } +} + + +template +__global__ __launch_bounds__(config::warp_size) void compute_omega_kernel( + size_type nrhs, const remove_complex kappa, + const ValueType* __restrict__ tht, + const remove_complex* __restrict__ residual_norm, + ValueType* __restrict__ omega, + const stopping_status* __restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + + if (global_id >= nrhs) { + return; + } + + if (!stop_status[global_id].has_stopped()) { + auto thr = omega[global_id]; + omega[global_id] /= tht[global_id]; + auto absrho = + abs(thr / (sqrt(real(tht[global_id])) * residual_norm[global_id])); + + if (absrho < kappa) { + omega[global_id] *= kappa / absrho; + } + } +} namespace { @@ -335,6 +648,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); } // namespace idr -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/solver/idr_kernels.hpp.inc b/common/cuda_hip/solver/idr_kernels.hpp.inc deleted file mode 100644 index 465417a6edb..00000000000 --- a/common/cuda_hip/solver/idr_kernels.hpp.inc +++ /dev/null @@ -1,318 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -template -__global__ __launch_bounds__(default_block_size) void initialize_m_kernel( - size_type 
subspace_dim, size_type nrhs, ValueType* __restrict__ m_values, - size_type m_stride, stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - const auto row = global_id / m_stride; - const auto col = global_id % m_stride; - - if (global_id < nrhs) { - stop_status[global_id].reset(); - } - - if (row < subspace_dim && col < nrhs * subspace_dim) { - m_values[row * m_stride + col] = - (row == col / nrhs) ? one() : zero(); - } -} - - -template -__global__ -__launch_bounds__(block_size) void orthonormalize_subspace_vectors_kernel( - size_type num_rows, size_type num_cols, ValueType* __restrict__ values, - size_type stride) -{ - const auto tidx = thread::get_thread_id_flat(); - - __shared__ uninitialized_array - reduction_helper_array; - // they are not be used in the same time. - ValueType* reduction_helper = reduction_helper_array; - auto reduction_helper_real = - reinterpret_cast*>(reduction_helper); - - for (size_type row = 0; row < num_rows; row++) { - for (size_type i = 0; i < row; i++) { - auto dot = zero(); - for (size_type j = tidx; j < num_cols; j += block_size) { - dot += values[row * stride + j] * conj(values[i * stride + j]); - } - - // Ensure already finish reading this shared memory - __syncthreads(); - reduction_helper[tidx] = dot; - reduce( - group::this_thread_block(), reduction_helper, - [](const ValueType& a, const ValueType& b) { return a + b; }); - __syncthreads(); - - dot = reduction_helper[0]; - for (size_type j = tidx; j < num_cols; j += block_size) { - values[row * stride + j] -= dot * values[i * stride + j]; - } - } - - auto norm = zero>(); - for (size_type j = tidx; j < num_cols; j += block_size) { - norm += squared_norm(values[row * stride + j]); - } - // Ensure already finish reading this shared memory - __syncthreads(); - reduction_helper_real[tidx] = norm; - reduce(group::this_thread_block(), reduction_helper_real, - [](const remove_complex& a, - const remove_complex& b) { return a + b; }); - 
__syncthreads(); - - norm = sqrt(reduction_helper_real[0]); - for (size_type j = tidx; j < num_cols; j += block_size) { - values[row * stride + j] /= norm; - } - } -} - - -template -__global__ -__launch_bounds__(default_block_size) void solve_lower_triangular_kernel( - size_type subspace_dim, size_type nrhs, - const ValueType* __restrict__ m_values, size_type m_stride, - const ValueType* __restrict__ f_values, size_type f_stride, - ValueType* __restrict__ c_values, size_type c_stride, - const stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - - if (global_id >= nrhs) { - return; - } - - if (!stop_status[global_id].has_stopped()) { - for (size_type row = 0; row < subspace_dim; row++) { - auto temp = f_values[row * f_stride + global_id]; - for (size_type col = 0; col < row; col++) { - temp -= m_values[row * m_stride + col * nrhs + global_id] * - c_values[col * c_stride + global_id]; - } - c_values[row * c_stride + global_id] = - temp / m_values[row * m_stride + row * nrhs + global_id]; - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void step_1_kernel( - size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs, - const ValueType* __restrict__ residual_values, size_type residual_stride, - const ValueType* __restrict__ c_values, size_type c_stride, - const ValueType* __restrict__ g_values, size_type g_stride, - ValueType* __restrict__ v_values, size_type v_stride, - const stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - const auto row = global_id / nrhs; - const auto col = global_id % nrhs; - - if (row >= num_rows) { - return; - } - - if (!stop_status[col].has_stopped()) { - auto temp = residual_values[row * residual_stride + col]; - for (size_type j = k; j < subspace_dim; j++) { - temp -= c_values[j * c_stride + col] * - g_values[row * g_stride + j * nrhs + col]; - } - v_values[row * v_stride + col] = temp; - } -} - - 
-template -__global__ __launch_bounds__(default_block_size) void step_2_kernel( - size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs, - const ValueType* __restrict__ omega_values, - const ValueType* __restrict__ v_values, size_type v_stride, - const ValueType* __restrict__ c_values, size_type c_stride, - ValueType* __restrict__ u_values, size_type u_stride, - const stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - const auto row = global_id / nrhs; - const auto col = global_id % nrhs; - - if (row >= num_rows) { - return; - } - - if (!stop_status[col].has_stopped()) { - auto temp = omega_values[col] * v_values[row * v_stride + col]; - for (size_type j = k; j < subspace_dim; j++) { - temp += c_values[j * c_stride + col] * - u_values[row * u_stride + j * nrhs + col]; - } - u_values[row * u_stride + k * nrhs + col] = temp; - } -} - - -template -__global__ __launch_bounds__(default_dot_size) void multidot_kernel( - size_type num_rows, size_type nrhs, const ValueType* __restrict__ p_i, - const ValueType* __restrict__ g_k, size_type g_k_stride, - ValueType* __restrict__ alpha, - const stopping_status* __restrict__ stop_status) -{ - const auto tidx = threadIdx.x; - const auto tidy = threadIdx.y; - const auto rhs = blockIdx.x * default_dot_dim + tidx; - const auto num = ceildiv(num_rows, gridDim.y); - const auto start_row = blockIdx.y * num; - const auto end_row = - ((blockIdx.y + 1) * num > num_rows) ? 
num_rows : (blockIdx.y + 1) * num; - // Used that way to get around dynamic initialization warning and - // template error when using `reduction_helper_array` directly in `reduce` - __shared__ - uninitialized_array - reduction_helper_array; - ValueType* __restrict__ reduction_helper = reduction_helper_array; - - ValueType local_res = zero(); - if (rhs < nrhs && !stop_status[rhs].has_stopped()) { - for (size_type i = start_row + tidy; i < end_row; - i += default_dot_dim) { - const auto g_idx = i * g_k_stride + rhs; - local_res += p_i[i] * g_k[g_idx]; - } - } - reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res; - __syncthreads(); - local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx]; - const auto tile_block = - group::tiled_partition(group::this_thread_block()); - const auto sum = - reduce(tile_block, local_res, - [](const ValueType& a, const ValueType& b) { return a + b; }); - const auto new_rhs = blockIdx.x * default_dot_dim + tidy; - if (tidx == 0 && new_rhs < nrhs && !stop_status[new_rhs].has_stopped()) { - atomic_add(alpha + new_rhs, sum); - } -} - - -template -__global__ __launch_bounds__(block_size) void update_g_k_and_u_kernel( - size_type k, size_type i, size_type size, size_type nrhs, - const ValueType* __restrict__ alpha, const ValueType* __restrict__ m_values, - size_type m_stride, const ValueType* __restrict__ g_values, - size_type g_stride, ValueType* __restrict__ g_k_values, - size_type g_k_stride, ValueType* __restrict__ u_values, size_type u_stride, - const stopping_status* __restrict__ stop_status) -{ - const auto tidx = thread::get_thread_id_flat(); - const auto row = tidx / g_k_stride; - const auto rhs = tidx % g_k_stride; - - if (row >= size || rhs >= nrhs) { - return; - } - - if (!stop_status[rhs].has_stopped()) { - const auto fact = alpha[rhs] / m_values[i * m_stride + i * nrhs + rhs]; - g_k_values[row * g_k_stride + rhs] -= - fact * g_values[row * g_stride + i * nrhs + rhs]; - u_values[row * u_stride + k * nrhs + 
rhs] -= - fact * u_values[row * u_stride + i * nrhs + rhs]; - } -} - - -template -__global__ __launch_bounds__(block_size) void update_g_kernel( - size_type k, size_type size, size_type nrhs, - const ValueType* __restrict__ g_k_values, size_type g_k_stride, - ValueType* __restrict__ g_values, size_type g_stride, - const stopping_status* __restrict__ stop_status) -{ - const auto tidx = thread::get_thread_id_flat(); - const auto row = tidx / g_k_stride; - const auto rhs = tidx % nrhs; - - if (row >= size || rhs >= nrhs) { - return; - } - - if (!stop_status[rhs].has_stopped()) { - g_values[row * g_stride + k * nrhs + rhs] = - g_k_values[row * g_k_stride + rhs]; - } -} - - -template -__global__ __launch_bounds__(default_block_size) void update_x_r_and_f_kernel( - size_type k, size_type size, size_type subspace_dim, size_type nrhs, - const ValueType* __restrict__ m_values, size_type m_stride, - const ValueType* __restrict__ g_values, size_type g_stride, - const ValueType* __restrict__ u_values, size_type u_stride, - ValueType* __restrict__ f_values, size_type f_stride, - ValueType* __restrict__ r_values, size_type r_stride, - ValueType* __restrict__ x_values, size_type x_stride, - const stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - const auto row = global_id / x_stride; - const auto col = global_id % x_stride; - - if (row >= size || col >= nrhs) { - return; - } - - if (!stop_status[col].has_stopped()) { - const auto beta = f_values[k * f_stride + col] / - m_values[k * m_stride + k * nrhs + col]; - r_values[row * r_stride + col] -= - beta * g_values[row * g_stride + k * nrhs + col]; - x_values[row * x_stride + col] += - beta * u_values[row * u_stride + k * nrhs + col]; - - if (k < row && k + 1 < subspace_dim && row < subspace_dim) { - f_values[row * f_stride + col] -= - beta * m_values[row * m_stride + k * nrhs + col]; - } - } -} - - -template -__global__ __launch_bounds__(config::warp_size) void 
compute_omega_kernel( - size_type nrhs, const remove_complex kappa, - const ValueType* __restrict__ tht, - const remove_complex* __restrict__ residual_norm, - ValueType* __restrict__ omega, - const stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - - if (global_id >= nrhs) { - return; - } - - if (!stop_status[global_id].has_stopped()) { - auto thr = omega[global_id]; - omega[global_id] /= tht[global_id]; - auto absrho = - abs(thr / (sqrt(real(tht[global_id])) * residual_norm[global_id])); - - if (absrho < kappa) { - omega[global_id] *= kappa / absrho; - } - } -} diff --git a/common/cuda_hip/solver/multigrid_kernels.hpp.inc b/common/cuda_hip/solver/multigrid_kernels.cpp similarity index 89% rename from common/cuda_hip/solver/multigrid_kernels.hpp.inc rename to common/cuda_hip/solver/multigrid_kernels.cpp index 98b1fcfeff4..61b6ee44836 100644 --- a/common/cuda_hip/solver/multigrid_kernels.hpp.inc +++ b/common/cuda_hip/solver/multigrid_kernels.cpp @@ -2,6 +2,34 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/solver/multigrid_kernels.hpp" + +#include +#include +#include +#include + +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/base/array_access.hpp" +#include "core/components/fill_array_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The MULTIGRID solver namespace. 
+ * + * @ingroup multigrid + */ +namespace multigrid { + + +constexpr int default_block_size = 512; + + namespace kernel { @@ -171,3 +199,9 @@ void kcycle_check_stop(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL); + + +} // namespace multigrid +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/stop/batch_criteria.hpp.inc b/common/cuda_hip/stop/batch_criteria.hpp similarity index 75% rename from common/cuda_hip/stop/batch_criteria.hpp.inc rename to common/cuda_hip/stop/batch_criteria.hpp index 38072467765..cecaa6b19d1 100644 --- a/common/cuda_hip/stop/batch_criteria.hpp.inc +++ b/common/cuda_hip/stop/batch_criteria.hpp @@ -2,6 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_INC_ +#define GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_INC_ + + +#include + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_stop { + + /** * @see reference/stop/batch_criteria.hpp */ @@ -49,3 +62,11 @@ class SimpleAbsResidual { private: const real_type abs_tol_; }; + + +} // namespace batch_stop +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + +#endif // GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_INC_ \ No newline at end of file diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu deleted file mode 100644 index 704192d0bff..00000000000 --- a/cuda/base/batch_multi_vector_kernels.cu +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/base/batch_multi_vector_kernels.hpp" - -#include -#include - -#include -#include - -#include "common/cuda_hip/base/blas_bindings.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/pointer_mode_guard.hpp" 
-#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The MultiVector matrix format namespace. - * - * @ingroup batch_multi_vector - */ -namespace batch_multi_vector { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" - - -#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_multi_vector -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/base/device_matrix_data_kernels.cu b/cuda/base/device_matrix_data_kernels.cu deleted file mode 100644 index 678c121016c..00000000000 --- a/cuda/base/device_matrix_data_kernels.cu +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/base/device_matrix_data_kernels.hpp" - -#include -#include -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/types.hpp" -#include "cuda/base/thrust.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace components { - - -#include "common/cuda_hip/base/device_matrix_data_kernels.hpp.inc" - - -} // namespace components -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh deleted file mode 100644 index 4b1d5ac05c3..00000000000 --- a/cuda/base/kernel_launch.cuh +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: 
2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_HPP_ -#error \ - "This file can only be used from inside common/unified/base/kernel_launch.hpp" -#endif - - -#include - -#include "accessor/cuda_hip_helper.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -template -struct to_device_type_impl&> { - using type = std::decay_t>()))>; - static type map_to_device(gko::acc::range& range) - { - return gko::acc::as_device_range(range); - } -}; - -template -struct to_device_type_impl&> { - using type = std::decay_t>()))>; - static type map_to_device(const gko::acc::range& range) - { - return gko::acc::as_device_range(range); - } -}; - - -namespace device_std = thrust; - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/base/kernel_launch.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh deleted file mode 100644 index 817d19006bc..00000000000 --- a/cuda/base/kernel_launch_reduction.cuh +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ -#error \ - "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" -#endif - - -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/base/kernel_launch_reduction.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} 
// namespace gko diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh deleted file mode 100644 index 0d9eaeb2653..00000000000 --- a/cuda/base/kernel_launch_solver.cuh +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_SOLVER_HPP_ -#error \ - "This file can only be used from inside common/unified/base/kernel_launch_solver.hpp" -#endif - - -#include "common/cuda_hip/base/runtime.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/base/kernel_launch_solver.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/components/atomic.cuh b/cuda/components/atomic.cuh deleted file mode 100644 index a9d63677267..00000000000 --- a/cuda/components/atomic.cuh +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_ATOMIC_CUH_ -#define GKO_CUDA_COMPONENTS_ATOMIC_CUH_ - - -#include - -#include "common/cuda_hip/base/types.hpp" -#include "cuda/base/math.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/atomic.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_ATOMIC_CUH_ diff --git a/cuda/components/diagonal_block_manipulation.cuh b/cuda/components/diagonal_block_manipulation.cuh deleted file mode 100644 index 7f19555ace5..00000000000 --- a/cuda/components/diagonal_block_manipulation.cuh +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_ -#define GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_ - - -#include - -#include "common/cuda_hip/base/config.hpp" 
-#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace csr { - - -#include "common/cuda_hip/components/diagonal_block_manipulation.hpp.inc" - - -} // namespace csr -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_ diff --git a/cuda/components/intrinsics.cuh b/cuda/components/intrinsics.cuh deleted file mode 100644 index d35043c34ce..00000000000 --- a/cuda/components/intrinsics.cuh +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_INTRINSICS_CUH_ -#define GKO_CUDA_COMPONENTS_INTRINSICS_CUH_ - - -#include - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/intrinsics.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_INTRINSICS_CUH_ diff --git a/cuda/components/merging.cuh b/cuda/components/merging.cuh deleted file mode 100644 index 3c7f5e52d47..00000000000 --- a/cuda/components/merging.cuh +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_MERGING_CUH_ -#define GKO_CUDA_COMPONENTS_MERGING_CUH_ - - -#include "core/base/utils.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/searching.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/merging.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_MERGING_CUH_ diff --git a/cuda/components/prefix_sum.cuh b/cuda/components/prefix_sum.cuh deleted file mode 100644 index 6693bbfc326..00000000000 --- 
a/cuda/components/prefix_sum.cuh +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_PREFIX_SUM_CUH_ -#define GKO_CUDA_COMPONENTS_PREFIX_SUM_CUH_ - - -#include - -#include "common/cuda_hip/base/blas_bindings.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/prefix_sum.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_PREFIX_SUM_CUH_ diff --git a/cuda/components/prefix_sum_kernels.cu b/cuda/components/prefix_sum_kernels.cu deleted file mode 100644 index 60b406ff894..00000000000 --- a/cuda/components/prefix_sum_kernels.cu +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/components/prefix_sum_kernels.hpp" - -#include - -#include - -#include -#include -#include - -#include "cuda/base/thrust.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace components { - - -#include "common/cuda_hip/components/prefix_sum_kernels.hpp.inc" - - -} // namespace components -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh deleted file mode 100644 index 1e4b7cb447c..00000000000 --- a/cuda/components/reduction.cuh +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_REDUCTION_CUH_ -#define GKO_CUDA_COMPONENTS_REDUCTION_CUH_ - - -#include - -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include 
"common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/array_access.hpp" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { - - -constexpr int default_reduce_block_size = 512; - - -#include "common/cuda_hip/components/reduction.hpp.inc" - - -/** - * Compute a reduction using add operation (+). - * - * @param exec Executor associated to the array - * @param size size of the array - * @param source the pointer of the array - * - * @return the reduction result - */ -template -__host__ ValueType reduce_add_array(std::shared_ptr exec, - size_type size, const ValueType* source) -{ - auto block_results_val = source; - size_type grid_dim = size; - auto block_results = array(exec); - if (size > default_reduce_block_size) { - const auto n = ceildiv(size, default_reduce_block_size); - grid_dim = - (n <= default_reduce_block_size) ? n : default_reduce_block_size; - - block_results.resize_and_reset(grid_dim); - - reduce_add_array<<get_stream()>>>( - size, as_device_type(source), - as_device_type(block_results.get_data())); - - block_results_val = block_results.get_const_data(); - } - - auto d_result = array(exec, 1); - - reduce_add_array<<<1, default_reduce_block_size, 0, exec->get_stream()>>>( - grid_dim, as_device_type(block_results_val), - as_device_type(d_result.get_data())); - auto answer = get_element(d_result, 0); - return answer; -} - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_REDUCTION_CUH_ diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh deleted file mode 100644 index 5472ac46ed1..00000000000 --- a/cuda/components/searching.cuh +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_SEARCHING_CUH_ -#define 
GKO_CUDA_COMPONENTS_SEARCHING_CUH_ - - -#include "common/cuda_hip/base/config.hpp" -#include "cuda/components/intrinsics.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/searching.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_SEARCHING_CUH_ diff --git a/cuda/components/segment_scan.cuh b/cuda/components/segment_scan.cuh deleted file mode 100644 index 6ffb8028334..00000000000 --- a/cuda/components/segment_scan.cuh +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_ -#define GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_ - - -#include "common/cuda_hip/components/cooperative_groups.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/segment_scan.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_ diff --git a/cuda/components/sorting.cuh b/cuda/components/sorting.cuh deleted file mode 100644 index 59e44d1bb82..00000000000 --- a/cuda/components/sorting.cuh +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_SORTING_CUH_ -#define GKO_CUDA_COMPONENTS_SORTING_CUH_ - - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/sorting.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_SORTING_CUH_ diff --git a/cuda/components/syncfree.cuh b/cuda/components/syncfree.cuh deleted file mode 100644 index 7d519891065..00000000000 --- a/cuda/components/syncfree.cuh +++ /dev/null @@ 
-1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_SYNCFREE_CUH_ -#define GKO_CUDA_COMPONENTS_SYNCFREE_CUH_ - - -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/memory.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "cuda/components/atomic.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/syncfree.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_SYNCFREE_CUH_ diff --git a/cuda/components/thread_ids.cuh b/cuda/components/thread_ids.cuh deleted file mode 100644 index 1113ea75fc6..00000000000 --- a/cuda/components/thread_ids.cuh +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_ -#define GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_ - - -#include "common/cuda_hip/base/config.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace thread { - - -#include "common/cuda_hip/components/thread_ids.hpp.inc" - - -} // namespace thread -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_ diff --git a/cuda/components/warp_blas.cuh b/cuda/components/warp_blas.cuh deleted file mode 100644 index 8e0042cfdad..00000000000 --- a/cuda/components/warp_blas.cuh +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_ -#define GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_ - - -#include -#include - -#include - -#include "cuda/base/math.hpp" -#include "cuda/components/reduction.cuh" - - -namespace gko { -namespace kernels { -namespace cuda 
{ - - -#include "common/cuda_hip/components/warp_blas.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_ diff --git a/cuda/distributed/matrix_kernels.cu b/cuda/distributed/matrix_kernels.cu deleted file mode 100644 index 1cb939d40e7..00000000000 --- a/cuda/distributed/matrix_kernels.cu +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/matrix_kernels.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "cuda/base/thrust.cuh" -#include "cuda/components/atomic.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace distributed_matrix { - - -#include "common/cuda_hip/distributed/matrix_kernels.hpp.inc" - - -} // namespace distributed_matrix -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/distributed/partition_helpers_kernels.cu b/cuda/distributed/partition_helpers_kernels.cu deleted file mode 100644 index 738d478d99a..00000000000 --- a/cuda/distributed/partition_helpers_kernels.cu +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/partition_helpers_kernels.hpp" - -#include -#include -#include -#include - -#include "cuda/base/thrust.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace partition_helpers { - - -#include "common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc" - - -} // namespace partition_helpers -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/distributed/partition_kernels.cu b/cuda/distributed/partition_kernels.cu deleted file mode 100644 index 050d6d285d6..00000000000 --- a/cuda/distributed/partition_kernels.cu +++ /dev/null @@ -1,31 +0,0 @@ -// 
SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/partition_kernels.hpp" - -#include -#include -#include -#include -#include -#include - -#include "common/unified/base/kernel_launch.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "cuda/base/thrust.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace partition { - - -#include "common/cuda_hip/distributed/partition_kernels.hpp.inc" - - -} // namespace partition -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu deleted file mode 100644 index 60388150da4..00000000000 --- a/cuda/distributed/vector_kernels.cu +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/vector_kernels.hpp" - -#include -#include -#include -#include -#include -#include - -#include - -#include "cuda/base/thrust.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace distributed_vector { - - -#include "common/cuda_hip/distributed/vector_kernels.hpp.inc" - - -} // namespace distributed_vector -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/cholesky_kernels.cu b/cuda/factorization/cholesky_kernels.cu deleted file mode 100644 index 7d5fe2c3d08..00000000000 --- a/cuda/factorization/cholesky_kernels.cu +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/cholesky_kernels.hpp" - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include "common/cuda_hip/base/sparselib_bindings.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/components/fill_array_kernels.hpp" 
-#include "core/components/format_conversion_kernels.hpp" -#include "core/factorization/elimination_forest.hpp" -#include "core/factorization/lu_kernels.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/syncfree.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Cholesky namespace. - * - * @ingroup factor - */ -namespace cholesky { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/factorization/cholesky_kernels.hpp.inc" - - -template -void symbolic_count(std::shared_ptr exec, - const matrix::Csr* mtx, - const factorization::elimination_forest& forest, - IndexType* row_nnz, array& tmp_storage) -{ - const auto num_rows = static_cast(mtx->get_size()[0]); - if (num_rows == 0) { - return; - } - const auto mtx_nnz = static_cast(mtx->get_num_stored_elements()); - tmp_storage.resize_and_reset(mtx_nnz + num_rows); - const auto postorder_cols = tmp_storage.get_data(); - const auto lower_ends = postorder_cols + mtx_nnz; - const auto row_ptrs = mtx->get_const_row_ptrs(); - const auto cols = mtx->get_const_col_idxs(); - const auto inv_postorder = forest.inv_postorder.get_const_data(); - const auto postorder_parent = forest.postorder_parents.get_const_data(); - // transform col indices to postorder indices - { - const auto num_blocks = ceildiv(num_rows, default_block_size); - kernel::build_postorder_cols<<get_stream()>>>( - num_rows, cols, row_ptrs, inv_postorder, postorder_cols, - lower_ends); - } - // sort postorder_cols inside rows - { - const auto handle = exec->get_sparselib_handle(); - auto descr = sparselib::create_mat_descr(); - array permutation_array(exec, mtx_nnz); - auto permutation = permutation_array.get_data(); - components::fill_seq_array(exec, permutation, mtx_nnz); - size_type buffer_size{}; - 
sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, - row_ptrs, postorder_cols, buffer_size); - array buffer_array{exec, buffer_size}; - auto buffer = buffer_array.get_data(); - sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, - postorder_cols, permutation, buffer); - sparselib::destroy(descr); - } - // count nonzeros per row of L - { - const auto num_blocks = - ceildiv(num_rows, default_block_size / config::warp_size); - kernel::symbolic_count - <<get_stream()>>>( - num_rows, row_ptrs, lower_ends, inv_postorder, postorder_cols, - postorder_parent, row_nnz); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); - - -} // namespace cholesky -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu deleted file mode 100644 index fcabf3676e6..00000000000 --- a/cuda/factorization/factorization_kernels.cu +++ /dev/null @@ -1,41 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/factorization_kernels.hpp" - -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/array_access.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/csr_builder.hpp" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/searching.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The factorization namespace. 
- * - * @ingroup factor - */ -namespace factorization { - - -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/factorization/factorization_kernels.hpp.inc" - - -} // namespace factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/lu_kernels.cu b/cuda/factorization/lu_kernels.cu deleted file mode 100644 index 57ed7ac8531..00000000000 --- a/cuda/factorization/lu_kernels.cu +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/lu_kernels.hpp" - -#include -#include - -#include -#include -#include - -#include - -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/allocator.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/syncfree.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The LU namespace. 
- * - * @ingroup factor - */ -namespace lu_factorization { - - -constexpr static int default_block_size = 512; - - -#include "common/cuda_hip/factorization/lu_kernels.hpp.inc" - - -} // namespace lu_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu deleted file mode 100644 index 473272fe1fb..00000000000 --- a/cuda/factorization/par_ic_kernels.cu +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ic_kernels.hpp" - -#include -#include -#include - -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/memory.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ic factorization namespace. - * - * @ingroup factor - */ -namespace par_ic_factorization { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc" - - -} // namespace par_ic_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu deleted file mode 100644 index 1f023892afb..00000000000 --- a/cuda/factorization/par_ilu_kernels.cu +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ilu_kernels.hpp" - -#include - -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/memory.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ilu factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ilu_factorization { - - -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/factorization/par_ilu_kernels.hpp.inc" - - -} // namespace par_ilu_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh deleted file mode 100644 index 3e53d6ef0a6..00000000000 --- a/cuda/log/batch_logger.cuh +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_LOG_BATCH_LOGGER_CUH_ -#define GKO_CUDA_LOG_BATCH_LOGGER_CUH_ - - -#include - - -namespace gko { -namespace kernels { -namespace cuda { -namespace batch_log { - - -#include "common/cuda_hip/log/batch_logger.hpp.inc" - - -} // namespace batch_log -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_LOG_BATCH_LOGGER_CUH_ diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu deleted file mode 100644 index 4fc5137646c..00000000000 --- a/cuda/matrix/batch_csr_kernels.cu +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_csr_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" -#include "cuda/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Csr matrix format namespace. 
- * @ref Csr - * @ingroup batch_csr - */ -namespace batch_csr { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_csr -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu deleted file mode 100644 index e28d4f91670..00000000000 --- a/cuda/matrix/batch_dense_kernels.cu +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_dense_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" -#include "cuda/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Dense matrix format namespace. 
- * - * @ingroup batch_dense - */ -namespace batch_dense { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" - - -// clang-format on - - -} // namespace batch_dense -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu deleted file mode 100644 index 90caf963200..00000000000 --- a/cuda/matrix/batch_ell_kernels.cu +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_ell_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" -#include "cuda/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Ell matrix format namespace. 
- * @ref Ell - * @ingroup batch_ell - */ -namespace batch_ell { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_ell -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu deleted file mode 100644 index 1536e88345e..00000000000 --- a/cuda/matrix/coo_kernels.cu +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/coo_kernels.hpp" - -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/segment_scan.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Coordinate matrix format namespace. 
- * - * @ingroup coo - */ -namespace coo { - - -constexpr int warps_in_block = 4; -constexpr int spmv_block_size = warps_in_block * config::warp_size; - - -#include "common/cuda_hip/matrix/coo_kernels.hpp.inc" - - -} // namespace coo -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu deleted file mode 100644 index b2114f936e7..00000000000 --- a/cuda/matrix/dense_kernels.cu +++ /dev/null @@ -1,230 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/dense_kernels.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/blas_bindings.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/pointer_mode_guard.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/utils.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Dense matrix format namespace. 
- * - * @ingroup dense - */ -namespace dense { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/matrix/dense_kernels.hpp.inc" - - -template -void compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result, array& tmp) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - blas::dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), y->get_stride(), - result->get_values()); - } else { - compute_dot(exec, x, y, result, tmp); - } - } else { - compute_dot(exec, x, y, result, tmp); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result, - array& tmp) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - blas::conj_dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); - } else { - compute_conj_dot(exec, x, y, result, tmp); - } - } else { - compute_conj_dot(exec, x, y, result, tmp); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - -template -void compute_norm2_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result, - array& tmp) -{ - if (x->get_size()[1] == 1) { - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - blas::norm2(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); - } else { - compute_norm2(exec, x, result, tmp); - } - } else { - compute_norm2(exec, x, result, tmp); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); 
- - -template -void simple_apply(std::shared_ptr exec, - const matrix::Dense* a, - const matrix::Dense* b, - matrix::Dense* c) -{ - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { - if (a->get_size()[1] > 0) { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1], - c->get_size()[0], a->get_size()[1], &alpha, - b->get_const_values(), b->get_stride(), - a->get_const_values(), a->get_stride(), &beta, - c->get_values(), c->get_stride()); - } else { - dense::fill(exec, c, zero()); - } - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); - - -template -void apply(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Dense* a, const matrix::Dense* b, - const matrix::Dense* beta, matrix::Dense* c) -{ - if (blas::is_supported::value) { - if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { - if (a->get_size()[1] > 0) { - blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N, - c->get_size()[1], c->get_size()[0], a->get_size()[1], - alpha->get_const_values(), b->get_const_values(), - b->get_stride(), a->get_const_values(), - a->get_stride(), beta->get_const_values(), - c->get_values(), c->get_stride()); - } else { - dense::scale(exec, beta, c); - } - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); - - -template -void transpose(std::shared_ptr exec, - const matrix::Dense* orig, - matrix::Dense* trans) -{ - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - 
orig->get_stride(), &beta, trans->get_const_values(), - trans->get_stride(), trans->get_values(), - trans->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } -}; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); - - -template -void conj_transpose(std::shared_ptr exec, - const matrix::Dense* orig, - matrix::Dense* trans) -{ - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, trans->get_const_values(), - trans->get_stride(), trans->get_values(), - trans->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); - - -} // namespace dense -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/diagonal_kernels.cu b/cuda/matrix/diagonal_kernels.cu deleted file mode 100644 index 78c0babe3a0..00000000000 --- a/cuda/matrix/diagonal_kernels.cu +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/diagonal_kernels.hpp" - -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Diagonal matrix format namespace. 
- * - * @ingroup diagonal - */ -namespace diagonal { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/matrix/diagonal_kernels.hpp.inc" - - -} // namespace diagonal -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/fbcsr_kernels.template.cu b/cuda/matrix/fbcsr_kernels.template.cu deleted file mode 100644 index 120a81c247c..00000000000 --- a/cuda/matrix/fbcsr_kernels.template.cu +++ /dev/null @@ -1,299 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/fbcsr_kernels.hpp" - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/blas_bindings.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/pointer_mode_guard.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/unified/base/kernel_launch.hpp" -#include "core/base/array_access.hpp" -#include "core/base/block_sizes.hpp" -#include "core/base/device_matrix_data_kernels.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/components/format_conversion_kernels.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/cusparse_block_bindings.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/atomic.cuh" -#include "cuda/components/merging.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { - - -/** - * 
@brief The fixed-size block compressed sparse row matrix format namespace. - * - * @ingroup fbcsr - */ -namespace fbcsr { - - -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/matrix/csr_common.hpp.inc" -#include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc" - - -namespace { - - -template -void dense_transpose(std::shared_ptr exec, - const size_type nrows, const size_type ncols, - const size_type orig_stride, const ValueType* const orig, - const size_type trans_stride, ValueType* const trans) -{ - if (nrows == 0) { - return; - } - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig, - orig_stride, &beta, trans, trans_stride, trans, - trans_stride); - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -} // namespace - - -template -void spmv(std::shared_ptr exec, - const matrix::Fbcsr* const a, - const matrix::Dense* const b, - matrix::Dense* const c) -{ - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - return; - } - if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { - // empty input: fill output with zero - dense::fill(exec, c, zero()); - return; - } - if (sparselib::is_supported::value) { - auto handle = exec->get_sparselib_handle(); - sparselib::pointer_mode_guard pm_guard(handle); - const auto alpha = one(); - const auto beta = zero(); - auto descr = sparselib::create_mat_descr(); - const auto row_ptrs = a->get_const_row_ptrs(); - const auto col_idxs = a->get_const_col_idxs(); - const auto values = a->get_const_values(); - const int bs = a->get_block_size(); - const IndexType mb = a->get_num_block_rows(); - const IndexType nb = a->get_num_block_cols(); - const auto nnzb = static_cast(a->get_num_stored_blocks()); - const auto nrhs = static_cast(b->get_size()[1]); - const auto nrows = a->get_size()[0]; - const 
auto ncols = a->get_size()[1]; - const auto in_stride = b->get_stride(); - const auto out_stride = c->get_stride(); - if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, - nnzb, &alpha, descr, values, row_ptrs, col_idxs, - bs, b->get_const_values(), &beta, c->get_values()); - } else { - const auto trans_stride = nrows; - auto trans_c = array(exec, nrows * nrhs); - sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, - SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, - &alpha, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), in_stride, &beta, - trans_c.get_data(), trans_stride); - dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), - out_stride, c->get_values()); - } - sparselib::destroy(descr); - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -template -void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* const alpha, - const matrix::Fbcsr* const a, - const matrix::Dense* const b, - const matrix::Dense* const beta, - matrix::Dense* const c) -{ - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - return; - } - if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { - // empty input: scale output - dense::scale(exec, beta, c); - return; - } - if (sparselib::is_supported::value) { - auto handle = exec->get_sparselib_handle(); - const auto alphp = alpha->get_const_values(); - const auto betap = beta->get_const_values(); - auto descr = sparselib::create_mat_descr(); - const auto row_ptrs = a->get_const_row_ptrs(); - const auto col_idxs = a->get_const_col_idxs(); - const auto values = a->get_const_values(); - const int bs = a->get_block_size(); - const IndexType mb = a->get_num_block_rows(); - const IndexType nb = a->get_num_block_cols(); - const auto nnzb = static_cast(a->get_num_stored_blocks()); - const auto nrhs = static_cast(b->get_size()[1]); - const auto nrows = a->get_size()[0]; - const auto ncols = 
a->get_size()[1]; - const auto in_stride = b->get_stride(); - const auto out_stride = c->get_stride(); - if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, - nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), betap, c->get_values()); - } else { - const auto trans_stride = nrows; - auto trans_c = array(exec, nrows * nrhs); - dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), - trans_stride, trans_c.get_data()); - sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, - SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, - alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), in_stride, betap, - trans_c.get_data(), trans_stride); - dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), - out_stride, c->get_values()); - } - sparselib::destroy(descr); - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -namespace { - - -template -void transpose_blocks_impl(syn::value_list, - std::shared_ptr exec, - matrix::Fbcsr* const mat) -{ - constexpr int subwarp_size = config::warp_size; - const auto nbnz = mat->get_num_stored_blocks(); - const auto numthreads = nbnz * subwarp_size; - const auto block_size = default_block_size; - const auto grid_dim = ceildiv(numthreads, block_size); - if (grid_dim > 0) { - kernel::transpose_blocks - <<get_stream()>>>( - nbnz, mat->get_values()); - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks, - transpose_blocks_impl); - - -} // namespace - - -template -void transpose(const std::shared_ptr exec, - const matrix::Fbcsr* const orig, - matrix::Fbcsr* const trans) -{ -#ifdef GKO_COMPILING_CUDA - if (sparselib::is_supported::value) { - const int bs = orig->get_block_size(); - const IndexType nnzb = - static_cast(orig->get_num_stored_blocks()); - cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; - cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - const IndexType buffer_size = 
sparselib::bsr_transpose_buffersize( - exec->get_sparselib_handle(), orig->get_num_block_rows(), - orig->get_num_block_cols(), nnzb, orig->get_const_values(), - orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs); - array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - sparselib::bsr_transpose( - exec->get_sparselib_handle(), orig->get_num_block_rows(), - orig->get_num_block_cols(), nnzb, orig->get_const_values(), - orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs, - trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(), - copyValues, idxBase, buffer); - - // transpose blocks - select_transpose_blocks( - fixedblock::compiled_kernels(), - [bs](int compiled_block_size) { return bs == compiled_block_size; }, - syn::value_list(), syn::type_list<>(), exec, trans); - } else -#endif - { - fallback_transpose(exec, orig, trans); - } -} - - -template -void conj_transpose(std::shared_ptr exec, - const matrix::Fbcsr* orig, - matrix::Fbcsr* trans) -{ - const int grid_size = - ceildiv(trans->get_num_stored_elements(), default_block_size); - transpose(exec, orig, trans); - if (grid_size > 0 && is_complex()) { - kernel:: - conjugate<<get_stream()>>>( - trans->get_num_stored_elements(), - as_device_type(trans->get_values())); - } -} - - -} // namespace fbcsr -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu deleted file mode 100644 index 07f5d5d8ec0..00000000000 --- a/cuda/matrix/sellp_kernels.cu +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/sellp_kernels.hpp" - -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" -#include "common/cuda_hip/base/types.hpp" -#include 
"core/components/prefix_sum_kernels.hpp" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The SELL-P matrix format namespace. - * - * @ingroup sellp - */ -namespace sellp { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/matrix/sellp_kernels.hpp.inc" - - -} // namespace sellp -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu deleted file mode 100644 index 17a1e004935..00000000000 --- a/cuda/matrix/sparsity_csr_kernels.cu +++ /dev/null @@ -1,223 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/sparsity_csr_kernels.hpp" - -#include - -#include - -#include "accessor/cuda_hip_helper.hpp" -#include "accessor/reduced_row_major.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/mixed_precision_types.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/components/format_conversion_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Compressed sparse row matrix format namespace. 
- * - * @ingroup sparsity - */ -namespace sparsity_csr { - - -constexpr int classical_oversubscription = 32; -constexpr int default_block_size = 512; -#ifdef GKO_COMPILING_HIP -constexpr int spmv_block_size = 256; -#else -constexpr int spmv_block_size = 128; -#endif -constexpr int warps_in_block = 4; - - -using classical_kernels = syn::value_list; - - -#include "common/cuda_hip/matrix/csr_common.hpp.inc" -#include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc" - - -namespace host_kernel { - - -template -void classical_spmv(syn::value_list, - std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - using arithmetic_type = - highest_precision; - using input_accessor = - gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; - using output_accessor = - gko::acc::reduced_row_major<2, arithmetic_type, OutputValueType>; - - const auto nwarps = exec->get_num_warps_per_sm() * - exec->get_num_multiprocessor() * - classical_oversubscription; - const auto gridx = - std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), - int64(nwarps / warps_in_block)); - const dim3 grid(gridx, b->get_size()[1]); - const auto block = spmv_block_size; - - const auto b_vals = gko::acc::range( - std::array{ - {static_cast(b->get_size()[0]), - static_cast(b->get_size()[1])}}, - b->get_const_values(), - std::array{ - {static_cast(b->get_stride())}}); - auto c_vals = gko::acc::range( - std::array{ - {static_cast(c->get_size()[0]), - static_cast(c->get_size()[1])}}, - c->get_values(), - std::array{ - {static_cast(c->get_stride())}}); - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - return; - } - if (alpha == nullptr && beta == nullptr) { - kernel::abstract_classical_spmv - <<get_stream()>>>( - a->get_size()[0], as_device_type(a->get_const_value()), - a->get_const_col_idxs(), - 
as_device_type(a->get_const_row_ptrs()), - acc::as_device_range(b_vals), acc::as_device_range(c_vals)); - } else if (alpha != nullptr && beta != nullptr) { - kernel::abstract_classical_spmv - <<get_stream()>>>( - a->get_size()[0], as_device_type(alpha->get_const_values()), - as_device_type(a->get_const_value()), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - acc::as_device_range(b_vals), - as_device_type(beta->get_const_values()), - acc::as_device_range(c_vals)); - } else { - GKO_KERNEL_NOT_FOUND; - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); - - -} // namespace host_kernel - -template -void spmv(std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - matrix::Dense* c) -{ - host_kernel::select_classical_spmv( - classical_kernels(), [](int compiled_info) { return true; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); - - -template -void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) -{ - host_kernel::select_classical_spmv( - classical_kernels(), [](int compiled_info) { return true; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, beta); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); - - -template -void sort_by_column_index(std::shared_ptr exec, - matrix::SparsityCsr* to_sort) -{ - const auto nnz = static_cast(to_sort->get_num_nonzeros()); - const auto num_rows = static_cast(to_sort->get_size()[0]); - const auto num_cols = static_cast(to_sort->get_size()[1]); - const auto row_ptrs = to_sort->get_const_row_ptrs(); - const auto col_idxs = to_sort->get_col_idxs(); - if (sparselib::is_supported::value) { - const auto handle = exec->get_sparselib_handle(); - auto descr = 
sparselib::create_mat_descr(); - array permutation_array(exec, to_sort->get_num_nonzeros()); - auto permutation = permutation_array.get_data(); - components::fill_seq_array(exec, permutation, - to_sort->get_num_nonzeros()); - size_type buffer_size{}; - sparselib::csrsort_buffer_size(handle, num_rows, num_cols, nnz, - row_ptrs, col_idxs, buffer_size); - array buffer_array{exec, buffer_size}; - auto buffer = buffer_array.get_data(); - sparselib::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs, - col_idxs, permutation, buffer); - sparselib::destroy(descr); - } else { - fallback_sort(exec, to_sort); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); - - -template -void is_sorted_by_column_index( - std::shared_ptr exec, - const matrix::SparsityCsr* to_check, bool* is_sorted) -{ - *is_sorted = true; - auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); - auto gpu_array = array{exec, cpu_array}; - const auto num_rows = static_cast(to_check->get_size()[0]); - auto num_blocks = ceildiv(num_rows, default_block_size); - if (num_blocks > 0) { - kernel::check_unsorted<<get_stream()>>>( - to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), - num_rows, gpu_array.get_data()); - } - cpu_array = gpu_array; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX); - - -} // namespace sparsity_csr -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/multigrid/pgm_kernels.cu b/cuda/multigrid/pgm_kernels.cu deleted file mode 100644 index 399d8a06c1b..00000000000 --- a/cuda/multigrid/pgm_kernels.cu +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/multigrid/pgm_kernels.hpp" - -#include - -#include -#include -#include -#include -#include - -#include -#include - -#include "common/cuda_hip/base/types.hpp" -#include 
"cuda/base/thrust.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The PGM solver namespace. - * - * @ingroup pgm - */ -namespace pgm { - - -#include "common/cuda_hip/multigrid/pgm_kernels.hpp.inc" - - -} // namespace pgm -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu deleted file mode 100644 index 8867bf643b0..00000000000 --- a/cuda/preconditioner/isai_kernels.cu +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/preconditioner/isai_kernels.hpp" - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/csr_builder.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/merging.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" -#include "cuda/components/warp_blas.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Isai preconditioner namespace. 
- * @ref Isai - * @ingroup isai - */ -namespace isai { - - -constexpr int subwarp_size{row_size_limit}; -constexpr int subwarps_per_block{2}; -constexpr int default_block_size{subwarps_per_block * subwarp_size}; - - -#include "common/cuda_hip/preconditioner/isai_kernels.hpp.inc" - - -} // namespace isai -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu deleted file mode 100644 index 783de652733..00000000000 --- a/cuda/preconditioner/jacobi_kernels.cu +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/preconditioner/jacobi_kernels.hpp" - -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/extended_float.hpp" -#include "core/preconditioner/jacobi_utils.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/thread_ids.cuh" -#include "cuda/preconditioner/jacobi_common.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Jacobi preconditioner namespace. 
- * @ref Jacobi - * @ingroup jacobi - */ -namespace jacobi { - - -// a total of 32/16 warps (1024 threads) -#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC -constexpr int default_num_warps = 16; -#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC -constexpr int default_num_warps = 32; -#endif -// with current architectures, at most 32 warps can be scheduled per SM (and -// current GPUs have at most 84 SMs) -constexpr int default_grid_size = 32 * 32 * 128; - - -#include "common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc" - - -} // namespace jacobi -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/reorder/rcm_kernels.cu b/cuda/reorder/rcm_kernels.cu deleted file mode 100644 index 8308cf88e60..00000000000 --- a/cuda/reorder/rcm_kernels.cu +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/reorder/rcm_kernels.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "common/cuda_hip/components/memory.hpp" -#include "core/base/array_access.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The reordering namespace. 
- * - * @ingroup reorder - */ -namespace rcm { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/reorder/rcm_kernels.hpp.inc" - - -} // namespace rcm -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu deleted file mode 100644 index 8b1a28d5581..00000000000 --- a/cuda/solver/cb_gmres_kernels.cu +++ /dev/null @@ -1,504 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/solver/cb_gmres_kernels.hpp" - -#include - -#include -#include -#include -#include - -#include "accessor/cuda_hip_helper.hpp" -#include "accessor/range.hpp" -#include "accessor/reduced_row_major.hpp" -#include "accessor/scaled_reduced_row_major.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/array_access.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/solver/cb_gmres_accessor.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The CB_GMRES solver namespace. - * - * @ingroup cb_gmres - */ -namespace cb_gmres { - - -constexpr int default_block_size = 512; -// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block -// size limit. 
-constexpr int default_dot_dim = 32; -constexpr int default_dot_size = default_dot_dim * default_dot_dim; - - -#include "common/cuda_hip/solver/cb_gmres_kernels.hpp.inc" - - -template -void zero_matrix(std::shared_ptr exec, size_type m, - size_type n, size_type stride, ValueType* array) -{ - const auto block_size = default_block_size; - const auto grid_size = ceildiv(n, block_size); - zero_matrix_kernel<<get_stream()>>>( - m, n, stride, as_device_type(array)); -} - - -template -void initialize(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) -{ - const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), - krylov_dim * b->get_size()[1]); - const auto grid_dim = ceildiv(num_threads, default_block_size); - const auto block_dim = default_block_size; - constexpr auto block_size = default_block_size; - - initialize_kernel - <<get_stream()>>>( - b->get_size()[0], b->get_size()[1], krylov_dim, - as_device_type(b->get_const_values()), b->get_stride(), - as_device_type(residual->get_values()), residual->get_stride(), - as_device_type(givens_sin->get_values()), givens_sin->get_stride(), - as_device_type(givens_cos->get_values()), givens_cos->get_stride(), - as_device_type(stop_status->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); - - -template -void restart(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense>* arnoldi_norm, - Accessor3d krylov_bases, - matrix::Dense* next_krylov_basis, - array* final_iter_nums, array& reduction_tmp, - size_type krylov_dim) -{ - constexpr bool use_scalar = - gko::cb_gmres::detail::has_3d_scaled_accessor::value; - const auto num_rows = residual->get_size()[0]; - const auto num_rhs = residual->get_size()[1]; - const auto krylov_stride = - 
gko::cb_gmres::helper_functions_accessor::get_stride( - krylov_bases); - const auto grid_dim_1 = - ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size); - const auto block_dim = default_block_size; - constexpr auto block_size = default_block_size; - const auto stride_arnoldi = arnoldi_norm->get_stride(); - - restart_1_kernel - <<get_stream()>>>( - residual->get_size()[0], residual->get_size()[1], krylov_dim, - acc::as_device_range(krylov_bases), - as_device_type(residual_norm_collection->get_values()), - residual_norm_collection->get_stride()); - kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm, - reduction_tmp); - - if (use_scalar) { - components::fill_array(exec, - arnoldi_norm->get_values() + 2 * stride_arnoldi, - num_rhs, zero>()); - const dim3 grid_size_nrm(ceildiv(num_rhs, default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 block_size_nrm(default_dot_dim, default_dot_dim); - multinorminf_without_stop_kernel<<get_stream()>>>( - num_rows, num_rhs, as_device_type(residual->get_const_values()), - residual->get_stride(), - as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), 0); - } - - if (gko::cb_gmres::detail::has_3d_scaled_accessor::value) { - set_scalar_kernel - <<get_stream()>>>( - num_rhs, krylov_dim + 1, - as_device_type(residual_norm->get_const_values()), - residual_norm->get_stride(), - as_device_type(arnoldi_norm->get_const_values() + - 2 * stride_arnoldi), - stride_arnoldi, acc::as_device_range(krylov_bases)); - } - - const auto grid_dim_2 = - ceildiv(std::max(num_rows, 1) * krylov_stride[1], - default_block_size); - restart_2_kernel - <<get_stream()>>>( - residual->get_size()[0], residual->get_size()[1], - as_device_type(residual->get_const_values()), - residual->get_stride(), - as_device_type(residual_norm->get_const_values()), - as_device_type(residual_norm_collection->get_values()), - acc::as_device_range(krylov_bases), - as_device_type(next_krylov_basis->get_values()), - 
next_krylov_basis->get_stride(), - as_device_type(final_iter_nums->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL); - - -template -void finish_arnoldi_CGS(std::shared_ptr exec, - matrix::Dense* next_krylov_basis, - Accessor3dim krylov_bases, - matrix::Dense* hessenberg_iter, - matrix::Dense* buffer_iter, - matrix::Dense>* arnoldi_norm, - size_type iter, const stopping_status* stop_status, - stopping_status* reorth_status, - array* num_reorth) -{ - const auto dim_size = next_krylov_basis->get_size(); - if (dim_size[1] == 0) { - return; - } - using non_complex = remove_complex; - // optimization parameter - constexpr int singledot_block_size = default_dot_dim; - constexpr bool use_scalar = - gko::cb_gmres::detail::has_3d_scaled_accessor::value; - const auto stride_next_krylov = next_krylov_basis->get_stride(); - const auto stride_hessenberg = hessenberg_iter->get_stride(); - const auto stride_buffer = buffer_iter->get_stride(); - const auto stride_arnoldi = arnoldi_norm->get_stride(); - const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 grid_size_num_iters(ceildiv(dim_size[1], default_dot_dim), - exec->get_num_multiprocessor() * 2, - iter + 1); - const dim3 block_size(default_dot_dim, default_dot_dim); - // Note: having iter first (instead of row_idx information) is likely - // beneficial for avoiding atomic_add conflicts, but that needs - // further investigation. 
- const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2, - iter + 1); - const auto block_size_iters_single = singledot_block_size; - size_type num_reorth_host; - - components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1], - zero()); - multinorm2_kernel<<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, as_device_type(arnoldi_norm->get_values()), - as_device_type(stop_status)); - // nrmP = norm(next_krylov_basis) - zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg, - hessenberg_iter->get_values()); - if (dim_size[1] > 1) { - multidot_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_device_range(krylov_bases), - as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, as_device_type(stop_status)); - } else { - singledot_kernel - <<get_stream()>>>( - dim_size[0], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_device_range(krylov_bases), - as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, as_device_type(stop_status)); - } - // for i in 1:iter - // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) - // end - update_next_krylov_kernel - <<get_stream()>>>( - iter + 1, dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_device_range(krylov_bases), - as_device_type(hessenberg_iter->get_const_values()), - stride_hessenberg, as_device_type(stop_status)); - - // for i in 1:iter - // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) - // end - components::fill_array(exec, arnoldi_norm->get_values() + stride_arnoldi, - dim_size[1], zero()); - if (use_scalar) { - components::fill_array(exec, - arnoldi_norm->get_values() + 2 * stride_arnoldi, - dim_size[1], zero()); - } - multinorm2_inf_kernel - <<get_stream()>>>( - dim_size[0], 
dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, - as_device_type(arnoldi_norm->get_values() + stride_arnoldi), - as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), - as_device_type(stop_status)); - // nrmN = norm(next_krylov_basis) - components::fill_array(exec, num_reorth->get_data(), 1, zero()); - check_arnoldi_norms - <<get_stream()>>>( - dim_size[1], as_device_type(arnoldi_norm->get_values()), - stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), - as_device_type(stop_status), as_device_type(reorth_status), - as_device_type(num_reorth->get_data())); - num_reorth_host = get_element(*num_reorth, 0); - // num_reorth_host := number of next_krylov vector to be reorthogonalization - for (size_type l = 1; (num_reorth_host > 0) && (l < 3); l++) { - zero_matrix(exec, iter + 1, dim_size[1], stride_buffer, - buffer_iter->get_values()); - if (dim_size[1] > 1) { - multidot_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_device_range(krylov_bases), - as_device_type(buffer_iter->get_values()), stride_buffer, - as_device_type(stop_status)); - } else { - singledot_kernel - <<get_stream()>>>( - dim_size[0], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_device_range(krylov_bases), - as_device_type(buffer_iter->get_values()), stride_buffer, - as_device_type(stop_status)); - } - // for i in 1:iter - // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) - // end - update_next_krylov_and_add_kernel - <<get_stream()>>>( - iter + 1, dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_values()), - stride_next_krylov, acc::as_device_range(krylov_bases), - as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, - as_device_type(buffer_iter->get_const_values()), stride_buffer, - 
as_device_type(stop_status), as_device_type(reorth_status)); - // for i in 1:iter - // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) - // end - components::fill_array(exec, - arnoldi_norm->get_values() + stride_arnoldi, - dim_size[1], zero()); - if (use_scalar) { - components::fill_array( - exec, arnoldi_norm->get_values() + 2 * stride_arnoldi, - dim_size[1], zero()); - } - multinorm2_inf_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, - as_device_type(arnoldi_norm->get_values() + stride_arnoldi), - as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), - as_device_type(stop_status)); - // nrmN = norm(next_krylov_basis) - components::fill_array(exec, num_reorth->get_data(), 1, - zero()); - check_arnoldi_norms - <<get_stream()>>>( - dim_size[1], as_device_type(arnoldi_norm->get_values()), - stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), - as_device_type(stop_status), as_device_type(reorth_status), - num_reorth->get_data()); - num_reorth_host = get_element(*num_reorth, 0); - // num_reorth_host := number of next_krylov vector to be - // reorthogonalization - } - update_krylov_next_krylov_kernel - <<get_stream()>>>( - iter, dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_device_range(krylov_bases), - as_device_type(hessenberg_iter->get_const_values()), - stride_hessenberg, as_device_type(stop_status)); - // next_krylov_basis /= hessenberg(iter, iter + 1) - // krylov_bases(:, iter + 1) = next_krylov_basis - // End of arnoldi -} - -template -void givens_rotation(std::shared_ptr exec, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense* hessenberg_iter, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - size_type iter, const array* stop_status) -{ - // TODO: tune block_size 
for optimal performance - constexpr auto block_size = default_block_size; - const auto num_cols = hessenberg_iter->get_size()[1]; - const auto block_dim = block_size; - const auto grid_dim = - static_cast(ceildiv(num_cols, block_size)); - - givens_rotation_kernel - <<get_stream()>>>( - hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1], - iter, as_device_type(hessenberg_iter->get_values()), - hessenberg_iter->get_stride(), - as_device_type(givens_sin->get_values()), givens_sin->get_stride(), - as_device_type(givens_cos->get_values()), givens_cos->get_stride(), - as_device_type(residual_norm->get_values()), - as_device_type(residual_norm_collection->get_values()), - residual_norm_collection->get_stride(), - stop_status->get_const_data()); -} - - -template -void arnoldi(std::shared_ptr exec, - matrix::Dense* next_krylov_basis, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, - matrix::Dense* buffer_iter, - matrix::Dense>* arnoldi_norm, - size_type iter, array* final_iter_nums, - const array* stop_status, - array* reorth_status, - array* num_reorth) -{ - increase_final_iteration_numbers_kernel<<< - static_cast( - ceildiv(final_iter_nums->get_size(), default_block_size)), - default_block_size, 0, exec->get_stream()>>>( - as_device_type(final_iter_nums->get_data()), - stop_status->get_const_data(), final_iter_nums->get_size()); - finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter, - buffer_iter, arnoldi_norm, iter, - stop_status->get_const_data(), reorth_status->get_data(), - num_reorth); - givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter, - residual_norm, residual_norm_collection, iter, stop_status); -} - -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL); - - -template -void solve_upper_triangular( - std::shared_ptr exec, - const matrix::Dense* 
residual_norm_collection, - const matrix::Dense* hessenberg, matrix::Dense* y, - const array* final_iter_nums) -{ - // TODO: tune block_size for optimal performance - constexpr auto block_size = default_block_size; - const auto num_rhs = residual_norm_collection->get_size()[1]; - const auto block_dim = block_size; - const auto grid_dim = - static_cast(ceildiv(num_rhs, block_size)); - - solve_upper_triangular_kernel - <<get_stream()>>>( - hessenberg->get_size()[1], num_rhs, - as_device_type(residual_norm_collection->get_const_values()), - residual_norm_collection->get_stride(), - as_device_type(hessenberg->get_const_values()), - hessenberg->get_stride(), as_device_type(y->get_values()), - y->get_stride(), as_device_type(final_iter_nums->get_const_data())); -} - - -template -void calculate_qy(std::shared_ptr exec, - ConstAccessor3d krylov_bases, size_type num_krylov_bases, - const matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - const auto num_rows = before_preconditioner->get_size()[0]; - const auto num_cols = before_preconditioner->get_size()[1]; - const auto stride_before_preconditioner = - before_preconditioner->get_stride(); - - constexpr auto block_size = default_block_size; - const auto grid_dim = static_cast( - ceildiv(num_rows * stride_before_preconditioner, block_size)); - const auto block_dim = block_size; - - calculate_Qy_kernel - <<get_stream()>>>( - num_rows, num_cols, acc::as_device_range(krylov_bases), - as_device_type(y->get_const_values()), y->get_stride(), - as_device_type(before_preconditioner->get_values()), - stride_before_preconditioner, - as_device_type(final_iter_nums->get_const_data())); - // Calculate qy - // before_preconditioner = krylov_bases * y -} - - -template -void solve_krylov(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - ConstAccessor3d krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* 
final_iter_nums) -{ - if (before_preconditioner->get_size()[1] == 0) { - return; - } - // since hessenberg has dims: iters x iters * num_rhs - // krylov_bases has dims: (iters + 1) x sysmtx[0] x num_rhs - const auto iters = - hessenberg->get_size()[1] / before_preconditioner->get_size()[1]; - const auto num_krylov_bases = iters + 1; - solve_upper_triangular(exec, residual_norm_collection, hessenberg, y, - final_iter_nums); - calculate_qy(exec, krylov_bases, num_krylov_bases, y, before_preconditioner, - final_iter_nums); -} - -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE( - GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL); - - -} // namespace cb_gmres -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu deleted file mode 100644 index 6001d42614d..00000000000 --- a/cuda/solver/multigrid_kernels.cu +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/solver/multigrid_kernels.hpp" - -#include -#include -#include -#include - -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "core/base/array_access.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The MULTIGRID solver namespace. 
- * - * @ingroup multigrid - */ -namespace multigrid { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/solver/multigrid_kernels.hpp.inc" - - -} // namespace multigrid -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/stop/batch_criteria.cuh b/cuda/stop/batch_criteria.cuh deleted file mode 100644 index f4f434dda11..00000000000 --- a/cuda/stop/batch_criteria.cuh +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ -#define GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ - - -#include - - -namespace gko { -namespace kernels { -namespace cuda { -namespace batch_stop { - - -#include "common/cuda_hip/stop/batch_criteria.hpp.inc" - - -} // namespace batch_stop -} // namespace cuda -} // namespace kernels -} // namespace gko - -#endif // GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp deleted file mode 100644 index 86b16c8975d..00000000000 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/base/batch_multi_vector_kernels.hpp" - -#include -#include - -#include -#include - -#include "common/cuda_hip/base/blas_bindings.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/pointer_mode_guard.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The 
MultiVector matrix format namespace. - * - * @ingroup batch_multi_vector - */ -namespace batch_multi_vector { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" - - -#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_multi_vector -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/base/device_matrix_data_kernels.hip.cpp b/hip/base/device_matrix_data_kernels.hip.cpp deleted file mode 100644 index d63a8e27ed5..00000000000 --- a/hip/base/device_matrix_data_kernels.hip.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/base/device_matrix_data_kernels.hpp" - -#include -#include -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/types.hpp" -#include "hip/base/thrust.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace components { - - -#include "common/cuda_hip/base/device_matrix_data_kernels.hpp.inc" - - -} // namespace components -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp deleted file mode 100644 index ff9f398c0bc..00000000000 --- a/hip/base/kernel_launch.hip.hpp +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_HPP_ -#error \ - "This file can only be used from inside common/unified/base/kernel_launch.hpp" -#endif - - -#include - -#include "accessor/cuda_hip_helper.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include 
"hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -template -struct to_device_type_impl&> { - using type = std::decay_t>()))>; - static type map_to_device(gko::acc::range& range) - { - return gko::acc::as_device_range(range); - } -}; - -template -struct to_device_type_impl&> { - using type = std::decay_t>()))>; - static type map_to_device(const gko::acc::range& range) - { - return gko::acc::as_device_range(range); - } -}; - - -namespace device_std = thrust; - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/base/kernel_launch.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp deleted file mode 100644 index c32fb592de0..00000000000 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ -#error \ - "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" -#endif - - -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/base/kernel_launch_reduction.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp deleted file mode 100644 index eda18f35eab..00000000000 --- a/hip/base/kernel_launch_solver.hip.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef 
GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_SOLVER_HPP_ -#error \ - "This file can only be used from inside common/unified/base/kernel_launch_solver.hpp" -#endif - - -#include "common/cuda_hip/base/runtime.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/base/kernel_launch_solver.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/components/atomic.hip.hpp b/hip/components/atomic.hip.hpp deleted file mode 100644 index 64d39a90d78..00000000000 --- a/hip/components/atomic.hip.hpp +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_ -#define GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_ - - -#include - -#include "common/cuda_hip/base/types.hpp" -#include "hip/base/math.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/atomic.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_ diff --git a/hip/components/diagonal_block_manipulation.hip.hpp b/hip/components/diagonal_block_manipulation.hip.hpp deleted file mode 100644 index 7a3893fa031..00000000000 --- a/hip/components/diagonal_block_manipulation.hip.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_ -#define GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_ - - -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace csr { - - -#include "common/cuda_hip/components/diagonal_block_manipulation.hpp.inc" - - -} // namespace csr -} // namespace hip -} // namespace 
kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_ diff --git a/hip/components/intrinsics.hip.hpp b/hip/components/intrinsics.hip.hpp deleted file mode 100644 index af849d4471a..00000000000 --- a/hip/components/intrinsics.hip.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_ -#define GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/intrinsics.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_ diff --git a/hip/components/merging.hip.hpp b/hip/components/merging.hip.hpp deleted file mode 100644 index 3f031947940..00000000000 --- a/hip/components/merging.hip.hpp +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_MERGING_HIP_HPP_ -#define GKO_HIP_COMPONENTS_MERGING_HIP_HPP_ - - -#include "core/base/utils.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/searching.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/merging.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_MERGING_HIP_HPP_ diff --git a/hip/components/prefix_sum.hip.hpp b/hip/components/prefix_sum.hip.hpp deleted file mode 100644 index deb78288e6c..00000000000 --- a/hip/components/prefix_sum.hip.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_ -#define GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_ - - -#include - 
-#include "common/cuda_hip/base/blas_bindings.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/prefix_sum.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_ diff --git a/hip/components/prefix_sum_kernels.hip.cpp b/hip/components/prefix_sum_kernels.hip.cpp deleted file mode 100644 index 283e8c161a1..00000000000 --- a/hip/components/prefix_sum_kernels.hip.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/components/prefix_sum_kernels.hpp" - -#include - -#include - -#include -#include -#include - -#include "hip/base/thrust.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace components { - - -#include "common/cuda_hip/components/prefix_sum_kernels.hpp.inc" - - -} // namespace components -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp deleted file mode 100644 index bc2594dd96d..00000000000 --- a/hip/components/reduction.hip.hpp +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_ -#define GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_ - - -#include - -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/array_access.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" - - -namespace gko { -namespace 
kernels { -namespace hip { - - -constexpr int default_reduce_block_size = 512; - - -#include "common/cuda_hip/components/reduction.hpp.inc" - - -/** - * Compute a reduction using add operation (+). - * - * @param exec Executor associated to the array - * @param size size of the array - * @param source the pointer of the array - * - * @return the reduction result - */ -template -__host__ ValueType reduce_add_array(std::shared_ptr exec, - size_type size, const ValueType* source) -{ - auto block_results_val = source; - size_type grid_dim = size; - auto block_results = array(exec); - if (size > default_reduce_block_size) { - const auto n = ceildiv(size, default_reduce_block_size); - grid_dim = - (n <= default_reduce_block_size) ? n : default_reduce_block_size; - - block_results.resize_and_reset(grid_dim); - - reduce_add_array<<get_stream()>>>( - size, as_device_type(source), - as_device_type(block_results.get_data())); - - block_results_val = block_results.get_const_data(); - } - - auto d_result = array(exec, 1); - - reduce_add_array<<<1, default_reduce_block_size, 0, exec->get_stream()>>>( - grid_dim, as_device_type(block_results_val), - as_device_type(d_result.get_data())); - auto answer = get_element(d_result, 0); - return answer; -} - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_ diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp deleted file mode 100644 index 9222de9e1d6..00000000000 --- a/hip/components/searching.hip.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_ -#define GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_ - - -#include "common/cuda_hip/base/config.hpp" -#include "hip/components/intrinsics.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/searching.hpp.inc" - - 
-} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_ diff --git a/hip/components/segment_scan.hip.hpp b/hip/components/segment_scan.hip.hpp deleted file mode 100644 index 93ebb35833a..00000000000 --- a/hip/components/segment_scan.hip.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_ -#define GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_ - - -#include "common/cuda_hip/components/cooperative_groups.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/segment_scan.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_ diff --git a/hip/components/sorting.hip.hpp b/hip/components/sorting.hip.hpp deleted file mode 100644 index 4a664aee453..00000000000 --- a/hip/components/sorting.hip.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_SORTING_HIP_HPP_ -#define GKO_HIP_COMPONENTS_SORTING_HIP_HPP_ - - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/sorting.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_SORTING_HIP_HPP_ diff --git a/hip/components/syncfree.hip.hpp b/hip/components/syncfree.hip.hpp deleted file mode 100644 index c174224c9c4..00000000000 --- a/hip/components/syncfree.hip.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_SYNCFREE_HIP_HPP_ -#define 
GKO_HIP_COMPONENTS_SYNCFREE_HIP_HPP_ - - -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/memory.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "hip/components/atomic.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/syncfree.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_SYNCFREE_HIP_HPP_ diff --git a/hip/components/thread_ids.hip.hpp b/hip/components/thread_ids.hip.hpp deleted file mode 100644 index 6f0bd44ba9c..00000000000 --- a/hip/components/thread_ids.hip.hpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_ -#define GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_ - - -#include "common/cuda_hip/base/config.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace thread { - - -#include "common/cuda_hip/components/thread_ids.hpp.inc" - - -} // namespace thread -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_ diff --git a/hip/components/warp_blas.hip.hpp b/hip/components/warp_blas.hip.hpp deleted file mode 100644 index 9164a1914b3..00000000000 --- a/hip/components/warp_blas.hip.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_ -#define GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_ - - -#include -#include - -#include - -#include "hip/base/math.hip.hpp" -#include "hip/components/reduction.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/warp_blas.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace 
gko - - -#endif // GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_ diff --git a/hip/distributed/matrix_kernels.hip.cpp b/hip/distributed/matrix_kernels.hip.cpp deleted file mode 100644 index 535fdaacb44..00000000000 --- a/hip/distributed/matrix_kernels.hip.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/matrix_kernels.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "hip/base/thrust.hip.hpp" -#include "hip/components/atomic.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace distributed_matrix { - - -#include "common/cuda_hip/distributed/matrix_kernels.hpp.inc" - - -} // namespace distributed_matrix -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/distributed/partition_helpers_kernels.hip.cpp b/hip/distributed/partition_helpers_kernels.hip.cpp deleted file mode 100644 index a2083a55303..00000000000 --- a/hip/distributed/partition_helpers_kernels.hip.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/partition_helpers_kernels.hpp" - -#include -#include -#include -#include - -#include "hip/base/thrust.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace partition_helpers { - - -#include "common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc" - - -} // namespace partition_helpers -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/distributed/partition_kernels.hip.cpp b/hip/distributed/partition_kernels.hip.cpp deleted file mode 100644 index c2c4a8f28ea..00000000000 --- a/hip/distributed/partition_kernels.hip.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// 
SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/partition_kernels.hpp" - -#include -#include -#include -#include -#include -#include - -#include "common/unified/base/kernel_launch.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "hip/base/thrust.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace partition { - - -#include "common/cuda_hip/distributed/partition_kernels.hpp.inc" - - -} // namespace partition -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp deleted file mode 100644 index eff7936076d..00000000000 --- a/hip/distributed/vector_kernels.hip.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/vector_kernels.hpp" - -#include -#include -#include -#include -#include -#include - -#include - -#include "hip/base/thrust.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace distributed_vector { - - -#include "common/cuda_hip/distributed/vector_kernels.hpp.inc" - - -} // namespace distributed_vector -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/cholesky_kernels.hip.cpp b/hip/factorization/cholesky_kernels.hip.cpp deleted file mode 100644 index 1c1ce1d3170..00000000000 --- a/hip/factorization/cholesky_kernels.hip.cpp +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/cholesky_kernels.hpp" - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include "common/cuda_hip/base/sparselib_bindings.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/components/fill_array_kernels.hpp" -#include 
"core/components/format_conversion_kernels.hpp" -#include "core/factorization/elimination_forest.hpp" -#include "core/factorization/lu_kernels.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/syncfree.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Cholesky namespace. - * - * @ingroup factor - */ -namespace cholesky { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/factorization/cholesky_kernels.hpp.inc" - - -template -void symbolic_count(std::shared_ptr exec, - const matrix::Csr* mtx, - const factorization::elimination_forest& forest, - IndexType* row_nnz, array& tmp_storage) -{ - const auto num_rows = static_cast(mtx->get_size()[0]); - if (num_rows == 0) { - return; - } - const auto mtx_nnz = static_cast(mtx->get_num_stored_elements()); - tmp_storage.resize_and_reset(mtx_nnz + num_rows); - const auto postorder_cols = tmp_storage.get_data(); - const auto lower_ends = postorder_cols + mtx_nnz; - const auto row_ptrs = mtx->get_const_row_ptrs(); - const auto cols = mtx->get_const_col_idxs(); - const auto inv_postorder = forest.inv_postorder.get_const_data(); - const auto postorder_parent = forest.postorder_parents.get_const_data(); - // transform col indices to postorder indices - { - const auto num_blocks = ceildiv(num_rows, default_block_size); - kernel::build_postorder_cols<<get_stream()>>>( - num_rows, cols, row_ptrs, inv_postorder, postorder_cols, - lower_ends); - } - // sort postorder_cols inside rows - { - const auto handle = exec->get_sparselib_handle(); - auto descr = sparselib::create_mat_descr(); - array permutation_array(exec, mtx_nnz); - auto permutation = permutation_array.get_data(); - components::fill_seq_array(exec, permutation, mtx_nnz); - size_type 
buffer_size{}; - sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, - row_ptrs, postorder_cols, buffer_size); - array buffer_array{exec, buffer_size}; - auto buffer = buffer_array.get_data(); - sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, - postorder_cols, permutation, buffer); - sparselib::destroy(descr); - } - // count nonzeros per row of L - { - const auto num_blocks = - ceildiv(num_rows, default_block_size / config::warp_size); - kernel::symbolic_count - <<get_stream()>>>( - num_rows, row_ptrs, lower_ends, inv_postorder, postorder_cols, - postorder_parent, row_nnz); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); - - -} // namespace cholesky -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp deleted file mode 100644 index d6768e5e9c6..00000000000 --- a/hip/factorization/factorization_kernels.hip.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/factorization_kernels.hpp" - -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/array_access.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/csr_builder.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/searching.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The factorization namespace. 
- * - * @ingroup factor - */ -namespace factorization { - - -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/factorization/factorization_kernels.hpp.inc" - - -} // namespace factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/lu_kernels.hip.cpp b/hip/factorization/lu_kernels.hip.cpp deleted file mode 100644 index 8e37d1a2445..00000000000 --- a/hip/factorization/lu_kernels.hip.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/lu_kernels.hpp" - -#include -#include - -#include -#include -#include - -#include - -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/allocator.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/syncfree.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The LU namespace. 
- * - * @ingroup factor - */ -namespace lu_factorization { - - -constexpr static int default_block_size = 512; - - -#include "common/cuda_hip/factorization/lu_kernels.hpp.inc" - - -} // namespace lu_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp deleted file mode 100644 index f0e0cb0b632..00000000000 --- a/hip/factorization/par_ic_kernels.hip.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ic_kernels.hpp" - -#include -#include -#include - -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/memory.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The parallel ic factorization namespace. - * - * @ingroup factor - */ -namespace par_ic_factorization { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc" - - -} // namespace par_ic_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp deleted file mode 100644 index b4897a23cf9..00000000000 --- a/hip/factorization/par_ilu_kernels.hip.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ilu_kernels.hpp" - -#include - -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/memory.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The parallel ilu factorization 
namespace. - * - * @ingroup factor - */ -namespace par_ilu_factorization { - - -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/factorization/par_ilu_kernels.hpp.inc" - - -} // namespace par_ilu_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/log/batch_logger.hip.hpp b/hip/log/batch_logger.hip.hpp deleted file mode 100644 index a2540f2bd9d..00000000000 --- a/hip/log/batch_logger.hip.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ -#define GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace hip { -namespace batch_log { - -#include "common/cuda_hip/log/batch_logger.hpp.inc" - - -} // namespace batch_log -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp deleted file mode 100644 index 4b0e6799834..00000000000 --- a/hip/matrix/batch_csr_kernels.hip.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_csr_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Csr matrix format 
namespace. - * @ref Csr - * @ingroup batch_csr - */ -namespace batch_csr { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_csr -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp deleted file mode 100644 index 328f268251f..00000000000 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_dense_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Dense matrix format namespace. 
- * - * @ingroup batch_dense - */ -namespace batch_dense { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" - - -// clang-format on - - -} // namespace batch_dense -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp deleted file mode 100644 index 01294ac3d63..00000000000 --- a/hip/matrix/batch_ell_kernels.hip.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_ell_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Ell matrix format namespace. 
- * @ref Ell - * @ingroup batch_ell - */ -namespace batch_ell { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_ell -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp deleted file mode 100644 index fe78b938e3c..00000000000 --- a/hip/matrix/coo_kernels.hip.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/coo_kernels.hpp" - -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/segment_scan.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Coordinate matrix format namespace. 
- * - * @ingroup coo - */ -namespace coo { - - -constexpr int warps_in_block = 4; -constexpr int spmv_block_size = warps_in_block * config::warp_size; - - -#include "common/cuda_hip/matrix/coo_kernels.hpp.inc" - - -} // namespace coo -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp deleted file mode 100644 index 82599050719..00000000000 --- a/hip/matrix/dense_kernels.hip.cpp +++ /dev/null @@ -1,230 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/dense_kernels.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/blas_bindings.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/pointer_mode_guard.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/utils.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Dense matrix format namespace. 
- * - * @ingroup dense - */ -namespace dense { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/matrix/dense_kernels.hpp.inc" - - -template -void compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result, array& tmp) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - blas::dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), y->get_stride(), - result->get_values()); - } else { - compute_dot(exec, x, y, result, tmp); - } - } else { - compute_dot(exec, x, y, result, tmp); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result, - array& tmp) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - blas::conj_dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); - } else { - compute_conj_dot(exec, x, y, result, tmp); - } - } else { - compute_conj_dot(exec, x, y, result, tmp); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - -template -void compute_norm2_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result, - array& tmp) -{ - if (x->get_size()[1] == 1) { - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - blas::norm2(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); - } else { - compute_norm2(exec, x, result, tmp); - } - } else { - compute_norm2(exec, x, result, tmp); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); 
- - -template -void simple_apply(std::shared_ptr exec, - const matrix::Dense* a, - const matrix::Dense* b, - matrix::Dense* c) -{ - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { - if (a->get_size()[1] > 0) { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1], - c->get_size()[0], a->get_size()[1], &alpha, - b->get_const_values(), b->get_stride(), - a->get_const_values(), a->get_stride(), &beta, - c->get_values(), c->get_stride()); - } else { - dense::fill(exec, c, zero()); - } - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); - - -template -void apply(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Dense* a, const matrix::Dense* b, - const matrix::Dense* beta, matrix::Dense* c) -{ - if (blas::is_supported::value) { - if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { - if (a->get_size()[1] > 0) { - blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N, - c->get_size()[1], c->get_size()[0], a->get_size()[1], - alpha->get_const_values(), b->get_const_values(), - b->get_stride(), a->get_const_values(), - a->get_stride(), beta->get_const_values(), - c->get_values(), c->get_stride()); - } else { - dense::scale(exec, beta, c); - } - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); - - -template -void transpose(std::shared_ptr exec, - const matrix::Dense* orig, - matrix::Dense* trans) -{ - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - 
orig->get_stride(), &beta, trans->get_const_values(), - trans->get_stride(), trans->get_values(), - trans->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } -}; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); - - -template -void conj_transpose(std::shared_ptr exec, - const matrix::Dense* orig, - matrix::Dense* trans) -{ - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, trans->get_const_values(), - trans->get_stride(), trans->get_values(), - trans->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); - - -} // namespace dense -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/diagonal_kernels.hip.cpp b/hip/matrix/diagonal_kernels.hip.cpp deleted file mode 100644 index b9585db9b41..00000000000 --- a/hip/matrix/diagonal_kernels.hip.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/diagonal_kernels.hpp" - -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Diagonal matrix format namespace. 
- * - * @ingroup diagonal - */ -namespace diagonal { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/matrix/diagonal_kernels.hpp.inc" - - -} // namespace diagonal -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp deleted file mode 100644 index cb8cca32d89..00000000000 --- a/hip/matrix/ell_kernels.hip.cpp +++ /dev/null @@ -1,270 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/ell_kernels.hpp" - -#include - -#include -#include -#include -#include -#include - -#include "accessor/cuda_hip_helper.hpp" -#include "accessor/reduced_row_major.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include "core/base/mixed_precision_types.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The ELL matrix format namespace. - * - * @ingroup ell - */ -namespace ell { - - -constexpr int default_block_size = 512; - - -// TODO: num_threads_per_core and ratio are parameters should be tuned -/** - * num_threads_per_core is the oversubscribing parameter. There are - * `num_threads_per_core` threads assigned to each physical core. 
- */ -constexpr int num_threads_per_core = 4; - - -/** - * ratio is the parameter to decide when to use threads to do reduction on each - * row. (#cols/#rows > ratio) - */ -constexpr double ratio = 1e-2; - - -/** - * max_thread_per_worker is the max number of thread per worker. The - * `compiled_kernels` must be a list <0, 1, 2, ..., max_thread_per_worker> - */ -constexpr int max_thread_per_worker = 32; - - -/** - * A compile-time list of sub-warp sizes for which the spmv kernels should be - * compiled. - * 0 is a special case where it uses a sub-warp size of warp_size in - * combination with atomic_adds. - */ -using compiled_kernels = syn::value_list; - - -#include "common/cuda_hip/matrix/ell_kernels.hpp.inc" - - -namespace { - - -template -void abstract_spmv(syn::value_list, - std::shared_ptr exec, - int num_worker_per_row, - const matrix::Ell* a, - const matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - using arithmetic_type = - highest_precision; - using a_accessor = - acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; - using b_accessor = - acc::reduced_row_major<2, arithmetic_type, const InputValueType>; - - const auto nrows = a->get_size()[0]; - const auto stride = a->get_stride(); - const auto num_stored_elements_per_row = - a->get_num_stored_elements_per_row(); - - constexpr int num_thread_per_worker = - (info == 0) ? 
max_thread_per_worker : info; - constexpr bool atomic = (info == 0); - const dim3 block_size(default_block_size / num_thread_per_worker, - num_thread_per_worker, 1); - const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x), - b->get_size()[1], 1); - - const auto a_vals = acc::range( - std::array{{static_cast( - num_stored_elements_per_row * stride)}}, - a->get_const_values()); - const auto b_vals = acc::range( - std::array{ - {static_cast(b->get_size()[0]), - static_cast(b->get_size()[1])}}, - b->get_const_values(), - std::array{ - {static_cast(b->get_stride())}}); - - if (alpha == nullptr && beta == nullptr) { - if (grid_size.x > 0 && grid_size.y > 0) { - kernel::spmv - <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_device_range(a_vals), - a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_device_range(b_vals), - as_device_type(c->get_values()), c->get_stride()); - } - } else if (alpha != nullptr && beta != nullptr) { - const auto alpha_val = acc::range( - std::array{1}, alpha->get_const_values()); - if (grid_size.x > 0 && grid_size.y > 0) { - kernel::spmv - <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_device_range(alpha_val), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - stride, num_stored_elements_per_row, - acc::as_device_range(b_vals), - as_device_type(beta->get_const_values()), - as_device_type(c->get_values()), c->get_stride()); - } - } else { - GKO_KERNEL_NOT_FOUND; - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, abstract_spmv); - - -template -std::array compute_thread_worker_and_atomicity( - std::shared_ptr exec, - const matrix::Ell* a) -{ - int num_thread_per_worker = 1; - int atomic = 0; - int num_worker_per_row = 1; - - const auto nrows = a->get_size()[0]; - const auto ell_ncols = a->get_num_stored_elements_per_row(); - // TODO: num_threads_per_core should be tuned for AMD gpu - const auto nwarps = exec->get_num_warps_per_sm() * - exec->get_num_multiprocessor() * 
num_threads_per_core; - - // Use multithreads to perform the reduction on each row when the matrix is - // wide. - // To make every thread have computation, so pick the value which is the - // power of 2 less than max_thread_per_worker and is less than or equal to - // ell_ncols. If the num_thread_per_worker is max_thread_per_worker and - // allow more than one worker to work on the same row, use atomic add to - // handle the worker write the value into the same position. The #worker is - // decided according to the number of worker allowed on GPU. - if (static_cast(ell_ncols) / nrows > ratio) { - while (num_thread_per_worker < max_thread_per_worker && - (num_thread_per_worker << 1) <= ell_ncols) { - num_thread_per_worker <<= 1; - } - if (num_thread_per_worker == max_thread_per_worker) { - num_worker_per_row = - std::min(ell_ncols / max_thread_per_worker, nwarps / nrows); - num_worker_per_row = std::max(num_worker_per_row, 1); - } - if (num_worker_per_row > 1) { - atomic = 1; - } - } - return {num_thread_per_worker, atomic, num_worker_per_row}; -} - - -} // namespace - - -template -void spmv(std::shared_ptr exec, - const matrix::Ell* a, - const matrix::Dense* b, - matrix::Dense* c) -{ - const auto data = compute_thread_worker_and_atomicity(exec, a); - const int num_thread_per_worker = std::get<0>(data); - const int atomic = std::get<1>(data); - const int num_worker_per_row = std::get<2>(data); - - /** - * info is the parameter for selecting the device kernel. 
- * for info == 0, it uses the kernel by warp_size threads with atomic - * operation for other value, it uses the kernel without atomic_add - */ - const int info = (!atomic) * num_thread_per_worker; - if (atomic) { - dense::fill(exec, c, zero()); - } - select_abstract_spmv( - compiled_kernels(), - [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, num_worker_per_row, a, - b, c); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_SPMV_KERNEL); - - -template -void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Ell* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) -{ - const auto data = compute_thread_worker_and_atomicity(exec, a); - const int num_thread_per_worker = std::get<0>(data); - const int atomic = std::get<1>(data); - const int num_worker_per_row = std::get<2>(data); - - /** - * info is the parameter for selecting the device kernel. - * for info == 0, it uses the kernel by warp_size threads with atomic - * operation for other value, it uses the kernel without atomic_add - */ - const int info = (!atomic) * num_thread_per_worker; - if (atomic) { - dense::scale(exec, beta, c); - } - select_abstract_spmv( - compiled_kernels(), - [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, num_worker_per_row, a, - b, c, alpha, beta); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); - - -} // namespace ell -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/fbcsr_kernels.template.hip.cpp b/hip/matrix/fbcsr_kernels.template.hip.cpp deleted file mode 100644 index c5d49215042..00000000000 --- a/hip/matrix/fbcsr_kernels.template.hip.cpp +++ /dev/null @@ -1,299 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include 
"core/matrix/fbcsr_kernels.hpp" - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/blas_bindings.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/pointer_mode_guard.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/unified/base/kernel_launch.hpp" -#include "core/base/array_access.hpp" -#include "core/base/block_sizes.hpp" -#include "core/base/device_matrix_data_kernels.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/components/format_conversion_kernels.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/hipsparse_block_bindings.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/merging.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -/** - * @brief The fixed-size block compressed sparse row matrix format namespace. 
- * - * @ingroup fbcsr - */ -namespace fbcsr { - - -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/matrix/csr_common.hpp.inc" -#include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc" - - -namespace { - - -template -void dense_transpose(std::shared_ptr exec, - const size_type nrows, const size_type ncols, - const size_type orig_stride, const ValueType* const orig, - const size_type trans_stride, ValueType* const trans) -{ - if (nrows == 0) { - return; - } - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig, - orig_stride, &beta, trans, trans_stride, trans, - trans_stride); - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -} // namespace - - -template -void spmv(std::shared_ptr exec, - const matrix::Fbcsr* const a, - const matrix::Dense* const b, - matrix::Dense* const c) -{ - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - return; - } - if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { - // empty input: fill output with zero - dense::fill(exec, c, zero()); - return; - } - if (sparselib::is_supported::value) { - auto handle = exec->get_sparselib_handle(); - sparselib::pointer_mode_guard pm_guard(handle); - const auto alpha = one(); - const auto beta = zero(); - auto descr = sparselib::create_mat_descr(); - const auto row_ptrs = a->get_const_row_ptrs(); - const auto col_idxs = a->get_const_col_idxs(); - const auto values = a->get_const_values(); - const int bs = a->get_block_size(); - const IndexType mb = a->get_num_block_rows(); - const IndexType nb = a->get_num_block_cols(); - const auto nnzb = static_cast(a->get_num_stored_blocks()); - const auto nrhs = static_cast(b->get_size()[1]); - const auto nrows = a->get_size()[0]; - const auto ncols = a->get_size()[1]; - const auto in_stride = b->get_stride(); - 
const auto out_stride = c->get_stride(); - if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, - nnzb, &alpha, descr, values, row_ptrs, col_idxs, - bs, b->get_const_values(), &beta, c->get_values()); - } else { - const auto trans_stride = nrows; - auto trans_c = array(exec, nrows * nrhs); - sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, - SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, - &alpha, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), in_stride, &beta, - trans_c.get_data(), trans_stride); - dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), - out_stride, c->get_values()); - } - sparselib::destroy(descr); - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -template -void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* const alpha, - const matrix::Fbcsr* const a, - const matrix::Dense* const b, - const matrix::Dense* const beta, - matrix::Dense* const c) -{ - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - return; - } - if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { - // empty input: scale output - dense::scale(exec, beta, c); - return; - } - if (sparselib::is_supported::value) { - auto handle = exec->get_sparselib_handle(); - const auto alphp = alpha->get_const_values(); - const auto betap = beta->get_const_values(); - auto descr = sparselib::create_mat_descr(); - const auto row_ptrs = a->get_const_row_ptrs(); - const auto col_idxs = a->get_const_col_idxs(); - const auto values = a->get_const_values(); - const int bs = a->get_block_size(); - const IndexType mb = a->get_num_block_rows(); - const IndexType nb = a->get_num_block_cols(); - const auto nnzb = static_cast(a->get_num_stored_blocks()); - const auto nrhs = static_cast(b->get_size()[1]); - const auto nrows = a->get_size()[0]; - const auto ncols = a->get_size()[1]; - const auto in_stride = b->get_stride(); - const auto 
out_stride = c->get_stride(); - if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, - nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), betap, c->get_values()); - } else { - const auto trans_stride = nrows; - auto trans_c = array(exec, nrows * nrhs); - dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), - trans_stride, trans_c.get_data()); - sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, - SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, - alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), in_stride, betap, - trans_c.get_data(), trans_stride); - dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), - out_stride, c->get_values()); - } - sparselib::destroy(descr); - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -namespace { - - -template -void transpose_blocks_impl(syn::value_list, - std::shared_ptr exec, - matrix::Fbcsr* const mat) -{ - constexpr int subwarp_size = config::warp_size; - const auto nbnz = mat->get_num_stored_blocks(); - const auto numthreads = nbnz * subwarp_size; - const auto block_size = default_block_size; - const auto grid_dim = ceildiv(numthreads, block_size); - if (grid_dim > 0) { - kernel::transpose_blocks - <<get_stream()>>>( - nbnz, mat->get_values()); - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks, - transpose_blocks_impl); - - -} // namespace - - -template -void transpose(const std::shared_ptr exec, - const matrix::Fbcsr* const orig, - matrix::Fbcsr* const trans) -{ -#ifdef GKO_COMPILING_CUDA - if (sparselib::is_supported::value) { - const int bs = orig->get_block_size(); - const IndexType nnzb = - static_cast(orig->get_num_stored_blocks()); - cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; - cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - const IndexType buffer_size = sparselib::bsr_transpose_buffersize( - exec->get_sparselib_handle(), 
orig->get_num_block_rows(), - orig->get_num_block_cols(), nnzb, orig->get_const_values(), - orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs); - array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - sparselib::bsr_transpose( - exec->get_sparselib_handle(), orig->get_num_block_rows(), - orig->get_num_block_cols(), nnzb, orig->get_const_values(), - orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs, - trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(), - copyValues, idxBase, buffer); - - // transpose blocks - select_transpose_blocks( - fixedblock::compiled_kernels(), - [bs](int compiled_block_size) { return bs == compiled_block_size; }, - syn::value_list(), syn::type_list<>(), exec, trans); - } else -#endif - { - fallback_transpose(exec, orig, trans); - } -} - - -template -void conj_transpose(std::shared_ptr exec, - const matrix::Fbcsr* orig, - matrix::Fbcsr* trans) -{ - const int grid_size = - ceildiv(trans->get_num_stored_elements(), default_block_size); - transpose(exec, orig, trans); - if (grid_size > 0 && is_complex()) { - kernel:: - conjugate<<get_stream()>>>( - trans->get_num_stored_elements(), - as_device_type(trans->get_values())); - } -} - - -} // namespace fbcsr -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp deleted file mode 100644 index 4caf83fdaa1..00000000000 --- a/hip/matrix/sellp_kernels.hip.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/sellp_kernels.hpp" - -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include 
"hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The SELL-P matrix format namespace. - * - * @ingroup sellp - */ -namespace sellp { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/matrix/sellp_kernels.hpp.inc" - - -} // namespace sellp -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/multigrid/pgm_kernels.hip.cpp b/hip/multigrid/pgm_kernels.hip.cpp deleted file mode 100644 index da5890315bc..00000000000 --- a/hip/multigrid/pgm_kernels.hip.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/multigrid/pgm_kernels.hpp" - -#include - -#include -#include -#include -#include -#include - -#include -#include - -#include "common/cuda_hip/base/types.hpp" -#include "hip/base/thrust.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The PGM solver namespace. 
- * - * @ingroup pgm - */ -namespace pgm { - - -#include "common/cuda_hip/multigrid/pgm_kernels.hpp.inc" - - -} // namespace pgm -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp deleted file mode 100644 index d3c2bd0fb1d..00000000000 --- a/hip/preconditioner/isai_kernels.hip.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/preconditioner/isai_kernels.hpp" - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/csr_builder.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/merging.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Isai preconditioner namespace. 
- * @ref Isai - * @ingroup isai - */ -namespace isai { - - -constexpr int subwarp_size{row_size_limit}; -constexpr int subwarps_per_block{2}; -constexpr int default_block_size{subwarps_per_block * subwarp_size}; - - -#include "common/cuda_hip/preconditioner/isai_kernels.hpp.inc" - - -} // namespace isai -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp deleted file mode 100644 index 122e53f636d..00000000000 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/preconditioner/jacobi_kernels.hpp" - -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/extended_float.hpp" -#include "core/preconditioner/jacobi_utils.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/preconditioner/jacobi_common.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Jacobi preconditioner namespace. 
- * @ref Jacobi - * @ingroup jacobi - */ -namespace jacobi { - - -// a total of 32/16 warps (1024 threads) -#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC -constexpr int default_num_warps = 16; -#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC -constexpr int default_num_warps = 32; -#endif -// with current architectures, at most 32 warps can be scheduled per SM (and -// current GPUs have at most 84 SMs) -constexpr int default_grid_size = 32 * 32 * 128; - - -#include "common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc" - - -} // namespace jacobi -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/reorder/rcm_kernels.hip.cpp b/hip/reorder/rcm_kernels.hip.cpp deleted file mode 100644 index 9ac6e44e173..00000000000 --- a/hip/reorder/rcm_kernels.hip.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/reorder/rcm_kernels.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "common/cuda_hip/components/memory.hpp" -#include "core/base/array_access.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The reordering namespace. 
- * - * @ingroup reorder - */ -namespace rcm { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/reorder/rcm_kernels.hpp.inc" - - -} // namespace rcm -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp deleted file mode 100644 index fd046d000b4..00000000000 --- a/hip/solver/cb_gmres_kernels.hip.cpp +++ /dev/null @@ -1,504 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/solver/cb_gmres_kernels.hpp" - -#include - -#include -#include -#include -#include - -#include "accessor/cuda_hip_helper.hpp" -#include "accessor/range.hpp" -#include "accessor/reduced_row_major.hpp" -#include "accessor/scaled_reduced_row_major.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/base/array_access.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/solver/cb_gmres_accessor.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The CB_GMRES solver namespace. - * - * @ingroup cb_gmres - */ -namespace cb_gmres { - - -constexpr int default_block_size = 512; -// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block -// size limit. 
-constexpr int default_dot_dim = 32; -constexpr int default_dot_size = default_dot_dim * default_dot_dim; - - -#include "common/cuda_hip/solver/cb_gmres_kernels.hpp.inc" - - -template -void zero_matrix(std::shared_ptr exec, size_type m, - size_type n, size_type stride, ValueType* array) -{ - const auto block_size = default_block_size; - const auto grid_size = ceildiv(n, block_size); - zero_matrix_kernel<<get_stream()>>>( - m, n, stride, as_device_type(array)); -} - - -template -void initialize(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) -{ - const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), - krylov_dim * b->get_size()[1]); - const auto grid_dim = ceildiv(num_threads, default_block_size); - const auto block_dim = default_block_size; - constexpr auto block_size = default_block_size; - - initialize_kernel - <<get_stream()>>>( - b->get_size()[0], b->get_size()[1], krylov_dim, - as_device_type(b->get_const_values()), b->get_stride(), - as_device_type(residual->get_values()), residual->get_stride(), - as_device_type(givens_sin->get_values()), givens_sin->get_stride(), - as_device_type(givens_cos->get_values()), givens_cos->get_stride(), - as_device_type(stop_status->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); - - -template -void restart(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense>* arnoldi_norm, - Accessor3d krylov_bases, - matrix::Dense* next_krylov_basis, - array* final_iter_nums, array& reduction_tmp, - size_type krylov_dim) -{ - constexpr bool use_scalar = - gko::cb_gmres::detail::has_3d_scaled_accessor::value; - const auto num_rows = residual->get_size()[0]; - const auto num_rhs = residual->get_size()[1]; - const auto krylov_stride = - 
gko::cb_gmres::helper_functions_accessor::get_stride( - krylov_bases); - const auto grid_dim_1 = - ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size); - const auto block_dim = default_block_size; - constexpr auto block_size = default_block_size; - const auto stride_arnoldi = arnoldi_norm->get_stride(); - - restart_1_kernel - <<get_stream()>>>( - residual->get_size()[0], residual->get_size()[1], krylov_dim, - acc::as_device_range(krylov_bases), - as_device_type(residual_norm_collection->get_values()), - residual_norm_collection->get_stride()); - kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm, - reduction_tmp); - - if (use_scalar) { - components::fill_array(exec, - arnoldi_norm->get_values() + 2 * stride_arnoldi, - num_rhs, zero>()); - const dim3 grid_size_nrm(ceildiv(num_rhs, default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 block_size_nrm(default_dot_dim, default_dot_dim); - multinorminf_without_stop_kernel<<get_stream()>>>( - num_rows, num_rhs, as_device_type(residual->get_const_values()), - residual->get_stride(), - as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), 0); - } - - if (gko::cb_gmres::detail::has_3d_scaled_accessor::value) { - set_scalar_kernel - <<get_stream()>>>( - num_rhs, krylov_dim + 1, - as_device_type(residual_norm->get_const_values()), - residual_norm->get_stride(), - as_device_type(arnoldi_norm->get_const_values() + - 2 * stride_arnoldi), - stride_arnoldi, acc::as_device_range(krylov_bases)); - } - - const auto grid_dim_2 = - ceildiv(std::max(num_rows, 1) * krylov_stride[1], - default_block_size); - restart_2_kernel - <<get_stream()>>>( - residual->get_size()[0], residual->get_size()[1], - as_device_type(residual->get_const_values()), - residual->get_stride(), - as_device_type(residual_norm->get_const_values()), - as_device_type(residual_norm_collection->get_values()), - acc::as_device_range(krylov_bases), - as_device_type(next_krylov_basis->get_values()), - 
next_krylov_basis->get_stride(), - as_device_type(final_iter_nums->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL); - - -template -void finish_arnoldi_CGS(std::shared_ptr exec, - matrix::Dense* next_krylov_basis, - Accessor3dim krylov_bases, - matrix::Dense* hessenberg_iter, - matrix::Dense* buffer_iter, - matrix::Dense>* arnoldi_norm, - size_type iter, const stopping_status* stop_status, - stopping_status* reorth_status, - array* num_reorth) -{ - const auto dim_size = next_krylov_basis->get_size(); - if (dim_size[1] == 0) { - return; - } - using non_complex = remove_complex; - // optimization parameter - constexpr int singledot_block_size = default_dot_dim; - constexpr bool use_scalar = - gko::cb_gmres::detail::has_3d_scaled_accessor::value; - const auto stride_next_krylov = next_krylov_basis->get_stride(); - const auto stride_hessenberg = hessenberg_iter->get_stride(); - const auto stride_buffer = buffer_iter->get_stride(); - const auto stride_arnoldi = arnoldi_norm->get_stride(); - const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 grid_size_num_iters(ceildiv(dim_size[1], default_dot_dim), - exec->get_num_multiprocessor() * 2, - iter + 1); - const dim3 block_size(default_dot_dim, default_dot_dim); - // Note: having iter first (instead of row_idx information) is likely - // beneficial for avoiding atomic_add conflicts, but that needs - // further investigation. 
- const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2, - iter + 1); - const auto block_size_iters_single = singledot_block_size; - size_type num_reorth_host; - - components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1], - zero()); - multinorm2_kernel<<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, as_device_type(arnoldi_norm->get_values()), - as_device_type(stop_status)); - // nrmP = norm(next_krylov_basis) - zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg, - hessenberg_iter->get_values()); - if (dim_size[1] > 1) { - multidot_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_device_range(krylov_bases), - as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, as_device_type(stop_status)); - } else { - singledot_kernel - <<get_stream()>>>( - dim_size[0], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_device_range(krylov_bases), - as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, as_device_type(stop_status)); - } - // for i in 1:iter - // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) - // end - update_next_krylov_kernel - <<get_stream()>>>( - iter + 1, dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_device_range(krylov_bases), - as_device_type(hessenberg_iter->get_const_values()), - stride_hessenberg, as_device_type(stop_status)); - - // for i in 1:iter - // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) - // end - components::fill_array(exec, arnoldi_norm->get_values() + stride_arnoldi, - dim_size[1], zero()); - if (use_scalar) { - components::fill_array(exec, - arnoldi_norm->get_values() + 2 * stride_arnoldi, - dim_size[1], zero()); - } - multinorm2_inf_kernel - <<get_stream()>>>( - dim_size[0], 
dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, - as_device_type(arnoldi_norm->get_values() + stride_arnoldi), - as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), - as_device_type(stop_status)); - // nrmN = norm(next_krylov_basis) - components::fill_array(exec, num_reorth->get_data(), 1, zero()); - check_arnoldi_norms - <<get_stream()>>>( - dim_size[1], as_device_type(arnoldi_norm->get_values()), - stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), - as_device_type(stop_status), as_device_type(reorth_status), - as_device_type(num_reorth->get_data())); - num_reorth_host = get_element(*num_reorth, 0); - // num_reorth_host := number of next_krylov vector to be reorthogonalization - for (size_type l = 1; (num_reorth_host > 0) && (l < 3); l++) { - zero_matrix(exec, iter + 1, dim_size[1], stride_buffer, - buffer_iter->get_values()); - if (dim_size[1] > 1) { - multidot_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_device_range(krylov_bases), - as_device_type(buffer_iter->get_values()), stride_buffer, - as_device_type(stop_status)); - } else { - singledot_kernel - <<get_stream()>>>( - dim_size[0], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_device_range(krylov_bases), - as_device_type(buffer_iter->get_values()), stride_buffer, - as_device_type(stop_status)); - } - // for i in 1:iter - // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) - // end - update_next_krylov_and_add_kernel - <<get_stream()>>>( - iter + 1, dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_values()), - stride_next_krylov, acc::as_device_range(krylov_bases), - as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, - as_device_type(buffer_iter->get_const_values()), stride_buffer, - 
as_device_type(stop_status), as_device_type(reorth_status)); - // for i in 1:iter - // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) - // end - components::fill_array(exec, - arnoldi_norm->get_values() + stride_arnoldi, - dim_size[1], zero()); - if (use_scalar) { - components::fill_array( - exec, arnoldi_norm->get_values() + 2 * stride_arnoldi, - dim_size[1], zero()); - } - multinorm2_inf_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, - as_device_type(arnoldi_norm->get_values() + stride_arnoldi), - as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), - as_device_type(stop_status)); - // nrmN = norm(next_krylov_basis) - components::fill_array(exec, num_reorth->get_data(), 1, - zero()); - check_arnoldi_norms - <<get_stream()>>>( - dim_size[1], as_device_type(arnoldi_norm->get_values()), - stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), - as_device_type(stop_status), as_device_type(reorth_status), - num_reorth->get_data()); - num_reorth_host = get_element(*num_reorth, 0); - // num_reorth_host := number of next_krylov vector to be - // reorthogonalization - } - update_krylov_next_krylov_kernel - <<get_stream()>>>( - iter, dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_device_range(krylov_bases), - as_device_type(hessenberg_iter->get_const_values()), - stride_hessenberg, as_device_type(stop_status)); - // next_krylov_basis /= hessenberg(iter, iter + 1) - // krylov_bases(:, iter + 1) = next_krylov_basis - // End of arnoldi -} - -template -void givens_rotation(std::shared_ptr exec, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense* hessenberg_iter, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - size_type iter, const array* stop_status) -{ - // TODO: tune block_size 
for optimal performance - constexpr auto block_size = default_block_size; - const auto num_cols = hessenberg_iter->get_size()[1]; - const auto block_dim = block_size; - const auto grid_dim = - static_cast(ceildiv(num_cols, block_size)); - - givens_rotation_kernel - <<get_stream()>>>( - hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1], - iter, as_device_type(hessenberg_iter->get_values()), - hessenberg_iter->get_stride(), - as_device_type(givens_sin->get_values()), givens_sin->get_stride(), - as_device_type(givens_cos->get_values()), givens_cos->get_stride(), - as_device_type(residual_norm->get_values()), - as_device_type(residual_norm_collection->get_values()), - residual_norm_collection->get_stride(), - stop_status->get_const_data()); -} - - -template -void arnoldi(std::shared_ptr exec, - matrix::Dense* next_krylov_basis, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, - matrix::Dense* buffer_iter, - matrix::Dense>* arnoldi_norm, - size_type iter, array* final_iter_nums, - const array* stop_status, - array* reorth_status, - array* num_reorth) -{ - increase_final_iteration_numbers_kernel<<< - static_cast( - ceildiv(final_iter_nums->get_size(), default_block_size)), - default_block_size, 0, exec->get_stream()>>>( - as_device_type(final_iter_nums->get_data()), - stop_status->get_const_data(), final_iter_nums->get_size()); - finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter, - buffer_iter, arnoldi_norm, iter, - stop_status->get_const_data(), reorth_status->get_data(), - num_reorth); - givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter, - residual_norm, residual_norm_collection, iter, stop_status); -} - -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL); - - -template -void solve_upper_triangular( - std::shared_ptr exec, - const matrix::Dense* 
residual_norm_collection, - const matrix::Dense* hessenberg, matrix::Dense* y, - const array* final_iter_nums) -{ - // TODO: tune block_size for optimal performance - constexpr auto block_size = default_block_size; - const auto num_rhs = residual_norm_collection->get_size()[1]; - const auto block_dim = block_size; - const auto grid_dim = - static_cast(ceildiv(num_rhs, block_size)); - - solve_upper_triangular_kernel - <<get_stream()>>>( - hessenberg->get_size()[1], num_rhs, - as_device_type(residual_norm_collection->get_const_values()), - residual_norm_collection->get_stride(), - as_device_type(hessenberg->get_const_values()), - hessenberg->get_stride(), as_device_type(y->get_values()), - y->get_stride(), as_device_type(final_iter_nums->get_const_data())); -} - - -template -void calculate_qy(std::shared_ptr exec, - ConstAccessor3d krylov_bases, size_type num_krylov_bases, - const matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - const auto num_rows = before_preconditioner->get_size()[0]; - const auto num_cols = before_preconditioner->get_size()[1]; - const auto stride_before_preconditioner = - before_preconditioner->get_stride(); - - constexpr auto block_size = default_block_size; - const auto grid_dim = static_cast( - ceildiv(num_rows * stride_before_preconditioner, block_size)); - const auto block_dim = block_size; - - calculate_Qy_kernel - <<get_stream()>>>( - num_rows, num_cols, acc::as_device_range(krylov_bases), - as_device_type(y->get_const_values()), y->get_stride(), - as_device_type(before_preconditioner->get_values()), - stride_before_preconditioner, - as_device_type(final_iter_nums->get_const_data())); - // Calculate qy - // before_preconditioner = krylov_bases * y -} - - -template -void solve_krylov(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - ConstAccessor3d krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* 
final_iter_nums) -{ - if (before_preconditioner->get_size()[1] == 0) { - return; - } - // since hessenberg has dims: iters x iters * num_rhs - // krylov_bases has dims: (iters + 1) x sysmtx[0] x num_rhs - const auto iters = - hessenberg->get_size()[1] / before_preconditioner->get_size()[1]; - const auto num_krylov_bases = iters + 1; - solve_upper_triangular(exec, residual_norm_collection, hessenberg, y, - final_iter_nums); - calculate_qy(exec, krylov_bases, num_krylov_bases, y, before_preconditioner, - final_iter_nums); -} - -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE( - GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL); - - -} // namespace cb_gmres -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp deleted file mode 100644 index c516597bd2b..00000000000 --- a/hip/solver/idr_kernels.hip.cpp +++ /dev/null @@ -1,340 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/solver/idr_kernels.hpp" - -#include -#include - -#include -#include - -#include "common/cuda_hip/base/blas_bindings.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/randlib_bindings.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The IDR solver namespace. 
- * - * @ingroup idr - */ -namespace idr { - - -constexpr int default_block_size = 512; -constexpr int default_dot_dim = 32; -constexpr int default_dot_size = default_dot_dim * default_dot_dim; - - -#include "common/cuda_hip/solver/idr_kernels.hpp.inc" - - -namespace { - - -template -void initialize_m(std::shared_ptr exec, - const size_type nrhs, matrix::Dense* m, - array* stop_status) -{ - const auto subspace_dim = m->get_size()[0]; - const auto m_stride = m->get_stride(); - - const auto grid_dim = ceildiv(m_stride * subspace_dim, default_block_size); - initialize_m_kernel<<get_stream()>>>( - subspace_dim, nrhs, as_device_type(m->get_values()), m_stride, - as_device_type(stop_status->get_data())); -} - - -template -void initialize_subspace_vectors(std::shared_ptr exec, - matrix::Dense* subspace_vectors, - bool deterministic) -{ - if (!deterministic) { - auto gen = randlib::rand_generator(std::random_device{}(), - RANDLIB_RNG_PSEUDO_DEFAULT, - exec->get_stream()); - randlib::rand_vector( - gen, - subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), - 0.0, 1.0, subspace_vectors->get_values()); - randlib::destroy(gen); - } -} - - -template -void orthonormalize_subspace_vectors( - std::shared_ptr exec, - matrix::Dense* subspace_vectors) -{ - orthonormalize_subspace_vectors_kernel - <<<1, default_block_size, 0, exec->get_stream()>>>( - subspace_vectors->get_size()[0], subspace_vectors->get_size()[1], - as_device_type(subspace_vectors->get_values()), - subspace_vectors->get_stride()); -} - - -template -void solve_lower_triangular(std::shared_ptr exec, - const size_type nrhs, - const matrix::Dense* m, - const matrix::Dense* f, - matrix::Dense* c, - const array* stop_status) -{ - const auto subspace_dim = m->get_size()[0]; - - const auto grid_dim = ceildiv(nrhs, default_block_size); - solve_lower_triangular_kernel<<get_stream()>>>( - subspace_dim, nrhs, as_device_type(m->get_const_values()), - m->get_stride(), as_device_type(f->get_const_values()), 
f->get_stride(), - as_device_type(c->get_values()), c->get_stride(), - stop_status->get_const_data()); -} - - -template -void update_g_and_u(std::shared_ptr exec, - const size_type nrhs, const size_type k, - const matrix::Dense* p, - const matrix::Dense* m, - matrix::Dense* alpha, - matrix::Dense* g, matrix::Dense* g_k, - matrix::Dense* u, - const array* stop_status) -{ - if (nrhs == 0) { - return; - } - const auto size = g->get_size()[0]; - const auto p_stride = p->get_stride(); - - const dim3 grid_dim(ceildiv(nrhs, default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 block_dim(default_dot_dim, default_dot_dim); - - for (size_type i = 0; i < k; i++) { - const auto p_i = p->get_const_values() + i * p_stride; - if (nrhs > 1 || is_complex()) { - components::fill_array(exec, alpha->get_values(), nrhs, - zero()); - multidot_kernel<<get_stream()>>>( - size, nrhs, as_device_type(p_i), - as_device_type(g_k->get_values()), g_k->get_stride(), - as_device_type(alpha->get_values()), - stop_status->get_const_data()); - } else { - blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(), - g_k->get_stride(), alpha->get_values()); - } - update_g_k_and_u_kernel - <<get_stride(), default_block_size), - default_block_size, 0, exec->get_stream()>>>( - k, i, size, nrhs, as_device_type(alpha->get_const_values()), - as_device_type(m->get_const_values()), m->get_stride(), - as_device_type(g->get_const_values()), g->get_stride(), - as_device_type(g_k->get_values()), g_k->get_stride(), - as_device_type(u->get_values()), u->get_stride(), - stop_status->get_const_data()); - } - update_g_kernel - <<get_stride(), default_block_size), - default_block_size, 0, exec->get_stream()>>>( - k, size, nrhs, as_device_type(g_k->get_const_values()), - g_k->get_stride(), as_device_type(g->get_values()), g->get_stride(), - stop_status->get_const_data()); -} - - -template -void update_m(std::shared_ptr exec, const size_type nrhs, - const size_type k, const matrix::Dense* p, - 
const matrix::Dense* g_k, matrix::Dense* m, - const array* stop_status) -{ - if (nrhs == 0) { - return; - } - const auto size = g_k->get_size()[0]; - const auto subspace_dim = m->get_size()[0]; - const auto p_stride = p->get_stride(); - const auto m_stride = m->get_stride(); - - const dim3 grid_dim(ceildiv(nrhs, default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 block_dim(default_dot_dim, default_dot_dim); - - for (size_type i = k; i < subspace_dim; i++) { - const auto p_i = p->get_const_values() + i * p_stride; - auto m_i = m->get_values() + i * m_stride + k * nrhs; - if (nrhs > 1 || is_complex()) { - components::fill_array(exec, m_i, nrhs, zero()); - multidot_kernel<<get_stream()>>>( - size, nrhs, as_device_type(p_i), - as_device_type(g_k->get_const_values()), g_k->get_stride(), - as_device_type(m_i), stop_status->get_const_data()); - } else { - blas::dot(exec->get_blas_handle(), size, p_i, 1, - g_k->get_const_values(), g_k->get_stride(), m_i); - } - } -} - - -template -void update_x_r_and_f(std::shared_ptr exec, - const size_type nrhs, const size_type k, - const matrix::Dense* m, - const matrix::Dense* g, - const matrix::Dense* u, - matrix::Dense* f, matrix::Dense* r, - matrix::Dense* x, - const array* stop_status) -{ - const auto size = x->get_size()[0]; - const auto subspace_dim = m->get_size()[0]; - - const auto grid_dim = ceildiv(size * x->get_stride(), default_block_size); - update_x_r_and_f_kernel<<get_stream()>>>( - k, size, subspace_dim, nrhs, as_device_type(m->get_const_values()), - m->get_stride(), as_device_type(g->get_const_values()), g->get_stride(), - as_device_type(u->get_const_values()), u->get_stride(), - as_device_type(f->get_values()), f->get_stride(), - as_device_type(r->get_values()), r->get_stride(), - as_device_type(x->get_values()), x->get_stride(), - stop_status->get_const_data()); - components::fill_array(exec, f->get_values() + k * f->get_stride(), nrhs, - zero()); -} - - -} // namespace - - -template -void 
initialize(std::shared_ptr exec, - const size_type nrhs, matrix::Dense* m, - matrix::Dense* subspace_vectors, bool deterministic, - array* stop_status) -{ - initialize_m(exec, nrhs, m, stop_status); - initialize_subspace_vectors(exec, subspace_vectors, deterministic); - orthonormalize_subspace_vectors(exec, subspace_vectors); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL); - - -template -void step_1(std::shared_ptr exec, const size_type nrhs, - const size_type k, const matrix::Dense* m, - const matrix::Dense* f, - const matrix::Dense* residual, - const matrix::Dense* g, matrix::Dense* c, - matrix::Dense* v, - const array* stop_status) -{ - solve_lower_triangular(exec, nrhs, m, f, c, stop_status); - - const auto num_rows = v->get_size()[0]; - const auto subspace_dim = m->get_size()[0]; - - const auto grid_dim = ceildiv(nrhs * num_rows, default_block_size); - step_1_kernel<<get_stream()>>>( - k, num_rows, subspace_dim, nrhs, - as_device_type(residual->get_const_values()), residual->get_stride(), - as_device_type(c->get_const_values()), c->get_stride(), - as_device_type(g->get_const_values()), g->get_stride(), - as_device_type(v->get_values()), v->get_stride(), - stop_status->get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL); - - -template -void step_2(std::shared_ptr exec, const size_type nrhs, - const size_type k, const matrix::Dense* omega, - const matrix::Dense* preconditioned_vector, - const matrix::Dense* c, matrix::Dense* u, - const array* stop_status) -{ - if (nrhs == 0) { - return; - } - const auto num_rows = preconditioned_vector->get_size()[0]; - const auto subspace_dim = u->get_size()[1] / nrhs; - - const auto grid_dim = ceildiv(nrhs * num_rows, default_block_size); - step_2_kernel<<get_stream()>>>( - k, num_rows, subspace_dim, nrhs, - as_device_type(omega->get_const_values()), - as_device_type(preconditioned_vector->get_const_values()), - preconditioned_vector->get_stride(), - 
as_device_type(c->get_const_values()), c->get_stride(), - as_device_type(u->get_values()), u->get_stride(), - stop_status->get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL); - - -template -void step_3(std::shared_ptr exec, const size_type nrhs, - const size_type k, const matrix::Dense* p, - matrix::Dense* g, matrix::Dense* g_k, - matrix::Dense* u, matrix::Dense* m, - matrix::Dense* f, matrix::Dense* alpha, - matrix::Dense* residual, matrix::Dense* x, - const array* stop_status) -{ - update_g_and_u(exec, nrhs, k, p, m, alpha, g, g_k, u, stop_status); - update_m(exec, nrhs, k, p, g_k, m, stop_status); - update_x_r_and_f(exec, nrhs, k, m, g, u, f, residual, x, stop_status); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL); - - -template -void compute_omega( - std::shared_ptr exec, const size_type nrhs, - const remove_complex kappa, const matrix::Dense* tht, - const matrix::Dense>* residual_norm, - matrix::Dense* omega, const array* stop_status) -{ - const auto grid_dim = ceildiv(nrhs, config::warp_size); - compute_omega_kernel<<get_stream()>>>( - nrhs, as_device_type(kappa), as_device_type(tht->get_const_values()), - as_device_type(residual_norm->get_const_values()), - as_device_type(omega->get_values()), stop_status->get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); - - -} // namespace idr -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp deleted file mode 100644 index 6e19606a78e..00000000000 --- a/hip/solver/multigrid_kernels.hip.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/solver/multigrid_kernels.hpp" - -#include -#include -#include -#include - -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" 
-#include "core/base/array_access.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The MULTIGRID solver namespace. - * - * @ingroup multigrid - */ -namespace multigrid { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/solver/multigrid_kernels.hpp.inc" - - -} // namespace multigrid -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/stop/batch_criteria.hip.hpp b/hip/stop/batch_criteria.hip.hpp deleted file mode 100644 index 1f721e36aaf..00000000000 --- a/hip/stop/batch_criteria.hip.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ -#define GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace hip { -namespace batch_stop { - - -#include "common/cuda_hip/stop/batch_criteria.hpp.inc" - - -} // namespace batch_stop -} // namespace hip -} // namespace kernels -} // namespace gko - -#endif // GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ From 403fcc3963911067b1fd9df637e0ca1e7dc82b65 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 29 Jun 2024 15:55:45 +0200 Subject: [PATCH 039/448] unify missing files, CMake changes --- common/CMakeLists.txt | 2 + common/cuda_hip/CMakeLists.txt | 51 +++++++++++++++++++ .../cuda_hip/base/{math.hpp.inc => math.hpp} | 18 +++++++ ..._array.hpp.inc => uninitialized_array.hpp} | 20 ++++++++ cuda/CMakeLists.txt | 29 +---------- cuda/base/math.hpp | 23 --------- cuda/components/uninitialized_array.hpp | 25 --------- hip/CMakeLists.txt | 29 +---------- hip/base/math.hip.hpp | 23 --------- hip/components/uninitialized_array.hip.hpp | 25 --------- 10 files changed, 93 insertions(+), 152 deletions(-) create mode 100644 common/cuda_hip/CMakeLists.txt rename 
common/cuda_hip/base/{math.hpp.inc => math.hpp} (79%) rename common/cuda_hip/components/{uninitialized_array.hpp.inc => uninitialized_array.hpp} (82%) delete mode 100644 cuda/base/math.hpp delete mode 100644 cuda/components/uninitialized_array.hpp delete mode 100644 hip/base/math.hip.hpp delete mode 100644 hip/components/uninitialized_array.hip.hpp diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 77bdd7230b9..e84ff9f5660 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,2 +1,4 @@ add_subdirectory(unified) +add_subdirectory(cuda_hip) set(GKO_UNIFIED_COMMON_SOURCES ${GKO_UNIFIED_COMMON_SOURCES} PARENT_SCOPE) +set(GKO_CUDA_HIP_COMMON_SOURCES ${GKO_CUDA_HIP_COMMON_SOURCES} PARENT_SCOPE) diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt new file mode 100644 index 00000000000..2cfbe6e9b0d --- /dev/null +++ b/common/cuda_hip/CMakeLists.txt @@ -0,0 +1,51 @@ +include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) +set(CUDA_HIP_SOURCES + base/batch_multi_vector_kernels.cpp + base/device_matrix_data_kernels.cpp + base/kernel_launch.hpp + base/kernel_launch_reduction.hpp + base/kernel_launch_solver.hpp + components/atomic.hpp + components/diagonal_block_manipulation.hpp + components/intrinsics.hpp + components/merging.hpp + components/prefix_sum.hpp + components/prefix_sum_kernels.cpp + components/reduction.hpp + components/searching.hpp + components/segment_scan.hpp + components/sorting.hpp + components/syncfree.hpp + components/thread_ids.hpp + components/warp_blas.hpp + distributed/matrix_kernels.cpp + distributed/partition_helpers_kernels.cpp + distributed/partition_kernels.cpp + distributed/vector_kernels.cpp + factorization/cholesky_kernels.cpp + factorization/factorization_kernels.cpp + factorization/lu_kernels.cpp + factorization/par_ic_kernels.cpp + factorization/par_ilu_kernels.cpp + log/batch_logger.hpp + matrix/batch_csr_kernels.cpp + matrix/batch_dense_kernels.cpp + 
matrix/batch_ell_kernels.cpp + matrix/coo_kernels.cpp + matrix/dense_kernels.cpp + matrix/diagonal_kernels.cpp + matrix/ell_kernels.cpp + matrix/fbcsr_kernels.cpp + matrix/sellp_kernels.cpp + matrix/sparsity_csr_kernels.cpp + multigrid/pgm_kernels.cpp + preconditioner/isai_kernels.cpp + preconditioner/jacobi_kernels.cpp + reorder/rcm_kernels.cpp + solver/cb_gmres_kernels.cpp + solver/idr_kernels.cpp + solver/multigrid_kernels.cpp + stop/batch_criteria.hpp + ) +list(TRANSFORM CUDA_HIP_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/) +set(GKO_CUDA_HIP_COMMON_SOURCES ${CUDA_HIP_SOURCES} PARENT_SCOPE) diff --git a/common/cuda_hip/base/math.hpp.inc b/common/cuda_hip/base/math.hpp similarity index 79% rename from common/cuda_hip/base/math.hpp.inc rename to common/cuda_hip/base/math.hpp index 430163f3791..44a26cadb53 100644 --- a/common/cuda_hip/base/math.hpp.inc +++ b/common/cuda_hip/base/math.hpp @@ -2,6 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_ +#define GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_ + + +#include + +#include + + +namespace gko { + + // We need this struct, because otherwise we would call a __host__ function in a // __device__ function (even though it is constexpr) template @@ -37,3 +49,9 @@ struct truncate_type_impl> { } // namespace detail + + +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/components/uninitialized_array.hpp.inc b/common/cuda_hip/components/uninitialized_array.hpp similarity index 82% rename from common/cuda_hip/components/uninitialized_array.hpp.inc rename to common/cuda_hip/components/uninitialized_array.hpp index 932ae8a5caa..215c7f5751a 100644 --- a/common/cuda_hip/components/uninitialized_array.hpp.inc +++ b/common/cuda_hip/components/uninitialized_array.hpp @@ -2,6 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_ +#define 
GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_ + + +#include + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + /** * Stores an array with uninitialized contents. * @@ -63,3 +75,11 @@ class uninitialized_array { private: unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size]; }; + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_ \ No newline at end of file diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 3d251ecfa82..505b222bb8d 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -7,9 +7,7 @@ add_instantiation_files(. matrix/fbcsr_kernels.instantiate.cu FBCSR_INSTANTIATE) list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda PRIVATE - base/batch_multi_vector_kernels.cu base/device.cpp - base/device_matrix_data_kernels.cu base/exception.cpp base/executor.cpp base/index_set_kernels.cpp @@ -19,56 +17,31 @@ target_sources(ginkgo_cuda base/stream.cpp base/timer.cpp base/version.cpp - components/prefix_sum_kernels.cu distributed/index_map_kernels.cu - distributed/matrix_kernels.cu - distributed/partition_helpers_kernels.cu - distributed/partition_kernels.cu - distributed/vector_kernels.cu - factorization/cholesky_kernels.cu - factorization/factorization_kernels.cu factorization/ic_kernels.cu factorization/ilu_kernels.cu - factorization/lu_kernels.cu - factorization/par_ic_kernels.cu factorization/par_ict_kernels.cu - factorization/par_ilu_kernels.cu factorization/par_ilut_approx_filter_kernels.cu factorization/par_ilut_filter_kernels.cu factorization/par_ilut_select_common.cu factorization/par_ilut_select_kernels.cu factorization/par_ilut_spgeam_kernels.cu factorization/par_ilut_sweep_kernels.cu - matrix/batch_csr_kernels.cu - matrix/batch_dense_kernels.cu - matrix/batch_ell_kernels.cu - 
matrix/coo_kernels.cu ${CSR_INSTANTIATE} - matrix/dense_kernels.cu - matrix/diagonal_kernels.cu - matrix/ell_kernels.cu ${FBCSR_INSTANTIATE} matrix/fft_kernels.cu - matrix/sellp_kernels.cu - matrix/sparsity_csr_kernels.cu - multigrid/pgm_kernels.cu preconditioner/batch_jacobi_kernels.cu - preconditioner/isai_kernels.cu preconditioner/jacobi_advanced_apply_kernels.cu preconditioner/jacobi_generate_kernels.cu - preconditioner/jacobi_kernels.cu preconditioner/jacobi_simple_apply_kernels.cu - reorder/rcm_kernels.cu solver/batch_bicgstab_kernels.cu solver/batch_cg_kernels.cu - solver/cb_gmres_kernels.cu - solver/idr_kernels.cu solver/lower_trs_kernels.cu - solver/multigrid_kernels.cu solver/upper_trs_kernels.cu stop/criterion_kernels.cu stop/residual_norm_kernels.cu ${GKO_UNIFIED_COMMON_SOURCES} + ${GKO_CUDA_HIP_COMMON_SOURCES} ) # override the default language mapping for the common files, set them to CUDA foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES) diff --git a/cuda/base/math.hpp b/cuda/base/math.hpp deleted file mode 100644 index d9fa5165cf6..00000000000 --- a/cuda/base/math.hpp +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_BASE_MATH_HPP_ -#define GKO_CUDA_BASE_MATH_HPP_ - - -#include - -#include - - -namespace gko { - - -#include "common/cuda_hip/base/math.hpp.inc" - - -} // namespace gko - - -#endif // GKO_CUDA_BASE_MATH_HPP_ diff --git a/cuda/components/uninitialized_array.hpp b/cuda/components/uninitialized_array.hpp deleted file mode 100644 index b98c812c16d..00000000000 --- a/cuda/components/uninitialized_array.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ -#define GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace cuda { - - 
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index bf2d6a6cf58..19f4dd54b2a 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -5,9 +5,7 @@ add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANT # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES - base/batch_multi_vector_kernels.hip.cpp base/device.hip.cpp - base/device_matrix_data_kernels.hip.cpp base/exception.hip.cpp base/executor.hip.cpp base/index_set_kernels.hip.cpp @@ -17,55 +15,30 @@ set(GINKGO_HIP_SOURCES base/stream.hip.cpp base/timer.hip.cpp base/version.hip.cpp - components/prefix_sum_kernels.hip.cpp distributed/index_map_kernels.hip.cpp - distributed/matrix_kernels.hip.cpp - distributed/partition_helpers_kernels.hip.cpp - distributed/partition_kernels.hip.cpp - distributed/vector_kernels.hip.cpp - factorization/cholesky_kernels.hip.cpp - factorization/factorization_kernels.hip.cpp factorization/ic_kernels.hip.cpp factorization/ilu_kernels.hip.cpp - factorization/lu_kernels.hip.cpp - factorization/par_ic_kernels.hip.cpp factorization/par_ict_kernels.hip.cpp - factorization/par_ilu_kernels.hip.cpp factorization/par_ilut_approx_filter_kernels.hip.cpp factorization/par_ilut_filter_kernels.hip.cpp factorization/par_ilut_select_common.hip.cpp factorization/par_ilut_select_kernels.hip.cpp factorization/par_ilut_spgeam_kernels.hip.cpp factorization/par_ilut_sweep_kernels.hip.cpp - matrix/batch_csr_kernels.hip.cpp - matrix/batch_dense_kernels.hip.cpp - matrix/batch_ell_kernels.hip.cpp - matrix/coo_kernels.hip.cpp ${CSR_INSTANTIATE} - matrix/dense_kernels.hip.cpp - matrix/diagonal_kernels.hip.cpp - matrix/ell_kernels.hip.cpp 
${FBCSR_INSTANTIATE} - matrix/sellp_kernels.hip.cpp - matrix/sparsity_csr_kernels.hip.cpp - multigrid/pgm_kernels.hip.cpp preconditioner/batch_jacobi_kernels.hip.cpp - preconditioner/isai_kernels.hip.cpp preconditioner/jacobi_advanced_apply_kernels.hip.cpp preconditioner/jacobi_generate_kernels.hip.cpp - preconditioner/jacobi_kernels.hip.cpp preconditioner/jacobi_simple_apply_kernels.hip.cpp - reorder/rcm_kernels.hip.cpp solver/batch_bicgstab_kernels.hip.cpp solver/batch_cg_kernels.hip.cpp - solver/cb_gmres_kernels.hip.cpp - solver/idr_kernels.hip.cpp solver/lower_trs_kernels.hip.cpp - solver/multigrid_kernels.hip.cpp solver/upper_trs_kernels.hip.cpp stop/criterion_kernels.hip.cpp stop/residual_norm_kernels.hip.cpp ${GKO_UNIFIED_COMMON_SOURCES} + ${GKO_CUDA_HIP_COMMON_SOURCES} ) if(hipfft_FOUND) diff --git a/hip/base/math.hip.hpp b/hip/base/math.hip.hpp deleted file mode 100644 index 9f577812f3e..00000000000 --- a/hip/base/math.hip.hpp +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_BASE_MATH_HIP_HPP_ -#define GKO_HIP_BASE_MATH_HIP_HPP_ - - -#include - -#include - - -namespace gko { - - -#include "common/cuda_hip/base/math.hpp.inc" - - -} // namespace gko - - -#endif // GKO_HIP_BASE_MATH_HIP_HPP_ diff --git a/hip/components/uninitialized_array.hip.hpp b/hip/components/uninitialized_array.hip.hpp deleted file mode 100644 index e59d2c21a63..00000000000 --- a/hip/components/uninitialized_array.hip.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_ -#define GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - 
-#endif // GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_ From 6144a60850e89a360754b84d0a36e5e8f5f28e00 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 29 Jun 2024 15:56:35 +0200 Subject: [PATCH 040/448] adapt headers --- common/unified/base/kernel_launch.hpp | 6 ++---- .../unified/base/kernel_launch_reduction.hpp | 6 ++---- common/unified/base/kernel_launch_solver.hpp | 6 ++---- core/solver/batch_dispatch.hpp | 17 +++++++++-------- cuda/CMakeLists.txt | 2 +- cuda/base/config.hpp | 2 +- cuda/base/cublas_bindings.hpp | 2 +- cuda/base/curand_bindings.hpp | 2 +- cuda/components/format_conversion.cuh | 2 +- cuda/distributed/index_map_kernels.cu | 4 ++-- cuda/factorization/par_ict_kernels.cu | 14 +++++++------- .../par_ilut_approx_filter_kernels.cu | 12 ++++++------ cuda/factorization/par_ilut_filter_kernels.cu | 6 +++--- cuda/factorization/par_ilut_select_common.cu | 14 +++++++------- cuda/factorization/par_ilut_select_kernels.cu | 14 +++++++------- cuda/factorization/par_ilut_spgeam_kernels.cu | 12 ++++++------ cuda/factorization/par_ilut_sweep_kernels.cu | 14 +++++++------- cuda/matrix/csr_kernels.template.cu | 18 +++++++++--------- cuda/preconditioner/batch_jacobi_kernels.cu | 4 ++-- cuda/preconditioner/batch_preconditioners.cuh | 2 +- ...acobi_advanced_apply_kernels.instantiate.cu | 6 +++--- .../jacobi_generate_kernels.instantiate.cu | 10 +++++----- .../jacobi_simple_apply_kernels.instantiate.cu | 6 +++--- cuda/solver/batch_bicgstab_kernels.cu | 6 +++--- cuda/solver/batch_cg_kernels.cu | 6 +++--- cuda/solver/common_trs_kernels.cuh | 8 ++++---- cuda/solver/lower_trs_kernels.cu | 2 +- cuda/solver/upper_trs_kernels.cu | 2 +- cuda/stop/criterion_kernels.cu | 4 ++-- cuda/stop/residual_norm_kernels.cu | 4 ++-- cuda/test/base/math.cu | 2 +- hip/base/config.hip.hpp | 2 +- hip/base/hipblas_bindings.hip.hpp | 2 +- hip/base/hiprand_bindings.hip.hpp | 2 +- hip/components/format_conversion.hip.hpp | 2 +- hip/distributed/index_map_kernels.hip.cpp | 4 ++-- 
hip/factorization/par_ict_kernels.hip.cpp | 14 +++++++------- .../par_ilut_approx_filter_kernels.hip.cpp | 12 ++++++------ .../par_ilut_filter_kernels.hip.cpp | 6 +++--- .../par_ilut_select_common.hip.cpp | 14 +++++++------- .../par_ilut_select_kernels.hip.cpp | 14 +++++++------- .../par_ilut_spgeam_kernels.hip.cpp | 12 ++++++------ .../par_ilut_sweep_kernels.hip.cpp | 14 +++++++------- hip/matrix/csr_kernels.template.hip.cpp | 18 +++++++++--------- .../batch_jacobi_kernels.hip.cpp | 10 +++++----- .../batch_preconditioners.hip.hpp | 2 +- ..._advanced_apply_kernels.instantiate.hip.cpp | 6 +++--- .../jacobi_generate_kernels.hip.cpp | 10 +++++----- ...jacobi_generate_kernels.instantiate.hip.cpp | 10 +++++----- .../jacobi_simple_apply_kernels.hip.cpp | 6 +++--- ...bi_simple_apply_kernels.instantiate.hip.cpp | 6 +++--- hip/solver/batch_bicgstab_kernels.hip.cpp | 8 ++++---- hip/solver/batch_cg_kernels.hip.cpp | 8 ++++---- hip/solver/common_trs_kernels.hip.hpp | 2 +- hip/solver/lower_trs_kernels.hip.cpp | 2 +- hip/solver/upper_trs_kernels.hip.cpp | 2 +- hip/stop/criterion_kernels.hip.cpp | 4 ++-- hip/stop/residual_norm_kernels.hip.cpp | 4 ++-- hip/test/base/math.hip.cpp | 2 +- 59 files changed, 204 insertions(+), 209 deletions(-) diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp index fad327ae3b1..73d37eb2ac2 100644 --- a/common/unified/base/kernel_launch.hpp +++ b/common/unified/base/kernel_launch.hpp @@ -269,10 +269,8 @@ typename to_device_type_impl::type map_to_device(T&& param) } // namespace gko -#if defined(GKO_COMPILING_CUDA) -#include "cuda/base/kernel_launch.cuh" -#elif defined(GKO_COMPILING_HIP) -#include "hip/base/kernel_launch.hip.hpp" +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) +#include "common/cuda_hip/base/kernel_launch.hpp" #elif defined(GKO_COMPILING_DPCPP) #include "dpcpp/base/kernel_launch.dp.hpp" #elif defined(GKO_COMPILING_OMP) diff --git a/common/unified/base/kernel_launch_reduction.hpp 
b/common/unified/base/kernel_launch_reduction.hpp index c3158d35a1c..b7b3e258dd4 100644 --- a/common/unified/base/kernel_launch_reduction.hpp +++ b/common/unified/base/kernel_launch_reduction.hpp @@ -19,10 +19,8 @@ {} -#if defined(GKO_COMPILING_CUDA) -#include "cuda/base/kernel_launch_reduction.cuh" -#elif defined(GKO_COMPILING_HIP) -#include "hip/base/kernel_launch_reduction.hip.hpp" +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) +#include "common/cuda_hip/base/kernel_launch_reduction.hpp" #elif defined(GKO_COMPILING_DPCPP) #include "dpcpp/base/kernel_launch_reduction.dp.hpp" #elif defined(GKO_COMPILING_OMP) diff --git a/common/unified/base/kernel_launch_solver.hpp b/common/unified/base/kernel_launch_solver.hpp index f4240805c64..14f2cbfeacf 100644 --- a/common/unified/base/kernel_launch_solver.hpp +++ b/common/unified/base/kernel_launch_solver.hpp @@ -107,10 +107,8 @@ const device_type* row_vector(const matrix::Dense* mtx) } // namespace gko -#if defined(GKO_COMPILING_CUDA) -#include "cuda/base/kernel_launch_solver.cuh" -#elif defined(GKO_COMPILING_HIP) -#include "hip/base/kernel_launch_solver.hip.hpp" +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) +#include "common/cuda_hip/base/kernel_launch_solver.hpp" #elif defined(GKO_COMPILING_DPCPP) #include "dpcpp/base/kernel_launch_solver.dp.hpp" #elif defined(GKO_COMPILING_OMP) diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index 8a142a5224a..ce8c4d86e71 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -21,15 +21,15 @@ #include "core/matrix/batch_struct.hpp" -#if defined GKO_COMPILING_CUDA +#if defined(GKO_COMPILING_CUDA) +#include "common/cuda_hip/log/batch_logger.hpp" +#include "common/cuda_hip/stop/batch_criteria.hpp" #include "cuda/base/batch_struct.hpp" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/log/batch_logger.cuh" #include "cuda/matrix/batch_struct.hpp" #include 
"cuda/preconditioner/batch_preconditioners.cuh" -#include "cuda/stop/batch_criteria.cuh" namespace gko { @@ -37,11 +37,12 @@ namespace batch { namespace solver { -namespace device = gko::kernels::cuda; +namespace device = gko::kernels::GKO_DEVICE_NAMESPACE; template -using DeviceValueType = typename gko::kernels::cuda::cuda_type; +using DeviceValueType = + typename gko::kernels::GKO_DEVICE_NAMESPACE::device_type; } // namespace solver @@ -49,15 +50,15 @@ using DeviceValueType = typename gko::kernels::cuda::cuda_type; } // namespace gko -#elif defined GKO_COMPILING_HIP +#elif defined(GKO_COMPILING_HIP) +#include "common/cuda_hip/log/batch_logger.hpp" +#include "common/cuda_hip/stop/batch_criteria.hpp" #include "hip/base/batch_struct.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/log/batch_logger.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" #include "hip/preconditioner/batch_preconditioners.hip.hpp" -#include "hip/stop/batch_criteria.hip.hpp" namespace gko { diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 505b222bb8d..bef62c12a9a 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -44,7 +44,7 @@ target_sources(ginkgo_cuda ${GKO_CUDA_HIP_COMMON_SOURCES} ) # override the default language mapping for the common files, set them to CUDA -foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES) +foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES) set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA) endforeach(source_file) if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) diff --git a/cuda/base/config.hpp b/cuda/base/config.hpp index 1ff249066bd..fe280c76dec 100644 --- a/cuda/base/config.hpp +++ b/cuda/base/config.hpp @@ -8,7 +8,7 @@ #include -#include "cuda/base/math.hpp" +#include "common/cuda_hip/base/math.hpp" namespace gko { diff --git a/cuda/base/cublas_bindings.hpp b/cuda/base/cublas_bindings.hpp index bc8da5851d5..ae5e66b6448 100644 --- a/cuda/base/cublas_bindings.hpp +++ 
b/cuda/base/cublas_bindings.hpp @@ -10,8 +10,8 @@ #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/types.hpp" -#include "cuda/base/math.hpp" namespace gko { diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp index 8d31ac2e90e..eb3dbee6b7b 100644 --- a/cuda/base/curand_bindings.hpp +++ b/cuda/base/curand_bindings.hpp @@ -10,8 +10,8 @@ #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/types.hpp" -#include "cuda/base/math.hpp" namespace gko { diff --git a/cuda/components/format_conversion.cuh b/cuda/components/format_conversion.cuh index 6690368cc4f..9ece2cdffe4 100644 --- a/cuda/components/format_conversion.cuh +++ b/cuda/components/format_conversion.cuh @@ -10,7 +10,7 @@ #include #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "cuda/components/thread_ids.cuh" +#include "common/cuda_hip/components/thread_ids.hpp" #ifdef GINKGO_BENCHMARK_ENABLE_TUNING diff --git a/cuda/distributed/index_map_kernels.cu b/cuda/distributed/index_map_kernels.cu index 42e8f118301..e55a4148e51 100644 --- a/cuda/distributed/index_map_kernels.cu +++ b/cuda/distributed/index_map_kernels.cu @@ -19,9 +19,9 @@ #include +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/searching.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/atomic.cuh" -#include "cuda/components/searching.cuh" namespace gko { diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu index fb7a0b0370a..62964925aa4 100644 --- a/cuda/factorization/par_ict_kernels.cu +++ b/cuda/factorization/par_ict_kernels.cu @@ -10,20 +10,20 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" #include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" 
+#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/merging.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/searching.cuh" -#include "cuda/components/thread_ids.cuh" namespace gko { diff --git a/cuda/factorization/par_ilut_approx_filter_kernels.cu b/cuda/factorization/par_ilut_approx_filter_kernels.cu index 51127ffd43b..93c0ef7fc95 100644 --- a/cuda/factorization/par_ilut_approx_filter_kernels.cu +++ b/cuda/factorization/par_ilut_approx_filter_kernels.cu @@ -11,20 +11,20 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/sorting.cuh" -#include "cuda/components/thread_ids.cuh" #include 
"cuda/factorization/par_ilut_select_common.cuh" diff --git a/cuda/factorization/par_ilut_filter_kernels.cu b/cuda/factorization/par_ilut_filter_kernels.cu index e15c7ec4cf6..3d6b41f07e6 100644 --- a/cuda/factorization/par_ilut_filter_kernels.cu +++ b/cuda/factorization/par_ilut_filter_kernels.cu @@ -9,18 +9,18 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/thread_ids.cuh" namespace gko { diff --git a/cuda/factorization/par_ilut_select_common.cu b/cuda/factorization/par_ilut_select_common.cu index 3f910f4884e..e0b81a81a1c 100644 --- a/cuda/factorization/par_ilut_select_common.cu +++ b/cuda/factorization/par_ilut_select_common.cu @@ -4,15 +4,15 @@ #include "cuda/factorization/par_ilut_select_common.cuh" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/intrinsics.cuh" -#include 
"cuda/components/prefix_sum.cuh" -#include "cuda/components/searching.cuh" -#include "cuda/components/sorting.cuh" -#include "cuda/components/thread_ids.cuh" namespace gko { diff --git a/cuda/factorization/par_ilut_select_kernels.cu b/cuda/factorization/par_ilut_select_kernels.cu index ac37e3a7595..a2395a16aea 100644 --- a/cuda/factorization/par_ilut_select_kernels.cu +++ b/cuda/factorization/par_ilut_select_kernels.cu @@ -8,16 +8,16 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/searching.cuh" -#include "cuda/components/sorting.cuh" -#include "cuda/components/thread_ids.cuh" #include "cuda/factorization/par_ilut_select_common.cuh" diff --git a/cuda/factorization/par_ilut_spgeam_kernels.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu index 83ec9c974b8..7277093314a 100644 --- a/cuda/factorization/par_ilut_spgeam_kernels.cu +++ b/cuda/factorization/par_ilut_spgeam_kernels.cu @@ -8,20 +8,20 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" 
#include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/merging.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/searching.cuh" -#include "cuda/components/thread_ids.cuh" namespace gko { diff --git a/cuda/factorization/par_ilut_sweep_kernels.cu b/cuda/factorization/par_ilut_sweep_kernels.cu index 8bdf6c9380a..9e277549aa4 100644 --- a/cuda/factorization/par_ilut_sweep_kernels.cu +++ b/cuda/factorization/par_ilut_sweep_kernels.cu @@ -8,21 +8,21 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" #include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/merging.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/searching.cuh" -#include "cuda/components/thread_ids.cuh" namespace gko { diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu index 600f4ffb5a3..151351c9204 100644 --- 
a/cuda/matrix/csr_kernels.template.cu +++ b/cuda/matrix/csr_kernels.template.cu @@ -25,11 +25,20 @@ #include "accessor/cuda_hip_helper.hpp" #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/array_access.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" @@ -40,16 +49,7 @@ #include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/atomic.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/merging.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/segment_scan.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" namespace gko { diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu index 1bc39df9781..e31e17dcafc 100644 --- a/cuda/preconditioner/batch_jacobi_kernels.cu +++ b/cuda/preconditioner/batch_jacobi_kernels.cu @@ -8,6 +8,8 @@ #include #include +#include "common/cuda_hip/components/intrinsics.hpp" +#include 
"common/cuda_hip/components/thread_ids.hpp" #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -18,8 +20,6 @@ #include "cuda/base/config.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/thread_ids.cuh" #include "cuda/matrix/batch_struct.hpp" #include "cuda/preconditioner/jacobi_common.hpp" diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh index e83d6e04ee9..01001c036b2 100644 --- a/cuda/preconditioner/batch_preconditioners.cuh +++ b/cuda/preconditioner/batch_preconditioners.cuh @@ -7,9 +7,9 @@ #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" -#include "cuda/components/reduction.cuh" namespace gko { diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu index e0b9145a0f7..60823cf6f4b 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu @@ -5,16 +5,16 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/thread_ids.cuh" -#include 
"cuda/components/warp_blas.cuh" #include "cuda/preconditioner/jacobi_common.hpp" diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu index c12df449e42..ff36c8efb1b 100644 --- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu @@ -6,18 +6,18 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/diagonal_block_manipulation.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/diagonal_block_manipulation.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" -#include "cuda/components/warp_blas.cuh" #include "cuda/preconditioner/jacobi_common.hpp" diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu index 45af2ec668f..d727c9439f9 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu @@ -5,16 +5,16 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" 
#include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/warp_blas.cuh" #include "cuda/preconditioner/jacobi_common.hpp" diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 28efaf07475..8d76f865a20 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -14,14 +14,14 @@ #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "cuda/base/batch_struct.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" #include "cuda/matrix/batch_struct.hpp" diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index cff72652629..2083cd98b5a 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -13,14 +13,14 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include 
"cuda/base/batch_struct.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" #include "cuda/matrix/batch_struct.hpp" diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index a205f155487..7cedf2fbd2e 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -17,17 +17,17 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" #include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/array_access.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" namespace gko { diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu index 898ffb92552..b37f6536b0f 100644 --- a/cuda/solver/lower_trs_kernels.cu +++ b/cuda/solver/lower_trs_kernels.cu @@ -13,9 +13,9 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" -#include "cuda/base/math.hpp" #include "cuda/solver/common_trs_kernels.cuh" diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu index b1f9e43ed2c..eb7d8386083 100644 --- a/cuda/solver/upper_trs_kernels.cu +++ b/cuda/solver/upper_trs_kernels.cu @@ -13,9 +13,9 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include 
"common/cuda_hip/base/types.hpp" -#include "cuda/base/math.hpp" #include "cuda/solver/common_trs_kernels.cuh" diff --git a/cuda/stop/criterion_kernels.cu b/cuda/stop/criterion_kernels.cu index 20538e87304..fa596f0c03f 100644 --- a/cuda/stop/criterion_kernels.cu +++ b/cuda/stop/criterion_kernels.cu @@ -8,9 +8,9 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/types.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/thread_ids.cuh" +#include "common/cuda_hip/components/thread_ids.hpp" namespace gko { diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu index d59f937b918..e52a74cf422 100644 --- a/cuda/stop/residual_norm_kernels.cu +++ b/cuda/stop/residual_norm_kernels.cu @@ -8,10 +8,10 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/base/array_access.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/thread_ids.cuh" namespace gko { diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu index e3c1d78ed39..71532b45e80 100644 --- a/cuda/test/base/math.cu +++ b/cuda/test/base/math.cu @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "cuda/base/math.hpp" +#include "common/cuda_hip/base/math.hpp" #include #include diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp index e74153cc34e..114eb2f0f0a 100644 --- a/hip/base/config.hip.hpp +++ b/hip/base/config.hip.hpp @@ -8,8 +8,8 @@ #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" -#include "hip/base/math.hip.hpp" namespace gko { diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp index 21c44e664b8..4641b64277d 100644 --- a/hip/base/hipblas_bindings.hip.hpp +++ b/hip/base/hipblas_bindings.hip.hpp @@ -16,9 +16,9 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include 
"common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" -#include "hip/base/math.hip.hpp" namespace gko { diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp index a76274c45a7..7cd76b9d320 100644 --- a/hip/base/hiprand_bindings.hip.hpp +++ b/hip/base/hiprand_bindings.hip.hpp @@ -15,9 +15,9 @@ #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" -#include "hip/base/math.hip.hpp" namespace gko { diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp index d2cbc3062a5..2e6c4eb1236 100644 --- a/hip/components/format_conversion.hip.hpp +++ b/hip/components/format_conversion.hip.hpp @@ -11,7 +11,7 @@ #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "hip/components/thread_ids.hip.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #ifdef GINKGO_BENCHMARK_ENABLE_TUNING diff --git a/hip/distributed/index_map_kernels.hip.cpp b/hip/distributed/index_map_kernels.hip.cpp index 536b09a1bb1..c722952f85d 100644 --- a/hip/distributed/index_map_kernels.hip.cpp +++ b/hip/distributed/index_map_kernels.hip.cpp @@ -19,9 +19,9 @@ #include +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/searching.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/searching.hip.hpp" namespace gko { diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp index 99b2f09274b..ed7b104471b 100644 --- a/hip/factorization/par_ict_kernels.hip.cpp +++ b/hip/factorization/par_ict_kernels.hip.cpp @@ -10,20 +10,20 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" #include "common/cuda_hip/components/memory.hpp" +#include 
"common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/merging.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/searching.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" namespace gko { diff --git a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp index b4fdd7e6e6d..31482cd4034 100644 --- a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp +++ b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp @@ -11,21 +11,21 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include 
"hip/components/intrinsics.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/sorting.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" #include "hip/factorization/par_ilut_select_common.hip.hpp" diff --git a/hip/factorization/par_ilut_filter_kernels.hip.cpp b/hip/factorization/par_ilut_filter_kernels.hip.cpp index 8f91e6f7087..bbe0b197d7c 100644 --- a/hip/factorization/par_ilut_filter_kernels.hip.cpp +++ b/hip/factorization/par_ilut_filter_kernels.hip.cpp @@ -9,18 +9,18 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" namespace gko { diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp index 098ce5c9887..89ceca0a024 100644 --- a/hip/factorization/par_ilut_select_common.hip.cpp +++ b/hip/factorization/par_ilut_select_common.hip.cpp @@ -10,15 +10,15 @@ #include "hip/factorization/par_ilut_select_common.hip.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include 
"core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/searching.hip.hpp" -#include "hip/components/sorting.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" namespace gko { diff --git a/hip/factorization/par_ilut_select_kernels.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp index 55180bc3d05..2e75f7de81b 100644 --- a/hip/factorization/par_ilut_select_kernels.hip.cpp +++ b/hip/factorization/par_ilut_select_kernels.hip.cpp @@ -8,16 +8,16 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/searching.hip.hpp" -#include "hip/components/sorting.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" #include "hip/factorization/par_ilut_select_common.hip.hpp" diff --git a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp index 200a16ea849..5757e00d2a3 100644 --- a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp +++ b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp @@ -8,20 +8,20 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include 
"common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/merging.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/searching.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" namespace gko { diff --git a/hip/factorization/par_ilut_sweep_kernels.hip.cpp b/hip/factorization/par_ilut_sweep_kernels.hip.cpp index b3994706567..de271d6eebd 100644 --- a/hip/factorization/par_ilut_sweep_kernels.hip.cpp +++ b/hip/factorization/par_ilut_sweep_kernels.hip.cpp @@ -8,21 +8,21 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" #include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include 
"hip/components/merging.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/searching.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" namespace gko { diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp index acd0b0144bb..f7766b8648b 100644 --- a/hip/matrix/csr_kernels.template.hip.cpp +++ b/hip/matrix/csr_kernels.template.hip.cpp @@ -25,11 +25,20 @@ #include "accessor/cuda_hip_helper.hpp" #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/array_access.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" @@ -40,16 +49,7 @@ #include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/merging.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/segment_scan.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include 
"hip/components/uninitialized_array.hip.hpp" namespace gko { diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp index db6e5a27b58..a112e3beb92 100644 --- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp +++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp @@ -8,6 +8,11 @@ #include #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/diagonal_block_manipulation.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -15,13 +20,8 @@ #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/batch_struct.hip.hpp" #include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/diagonal_block_manipulation.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp index f3969c16b81..f62000ff46f 100644 --- a/hip/preconditioner/batch_preconditioners.hip.hpp +++ b/hip/preconditioner/batch_preconditioners.hip.hpp @@ -7,9 +7,9 @@ #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" -#include "hip/components/reduction.hip.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp 
b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp index 7e6311bcd52..d30f4edd787 100644 --- a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp @@ -5,17 +5,17 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" diff --git a/hip/preconditioner/jacobi_generate_kernels.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.hip.cpp index 9f2d3238a83..3f6d3a4e91f 100644 --- a/hip/preconditioner/jacobi_generate_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernels.hip.cpp @@ -6,19 +6,19 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/diagonal_block_manipulation.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" 
#include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/diagonal_block_manipulation.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" diff --git a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp index 3685df4aa0e..3c18703557d 100644 --- a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp @@ -6,18 +6,18 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/diagonal_block_manipulation.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/diagonal_block_manipulation.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp index d922d178f88..563f5829536 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp @@ -5,17 +5,17 @@ #include #include 
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp index baa847c58a5..7a6e2a46b04 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp @@ -5,16 +5,16 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 44e2f0f3c48..96587f8479e 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ 
b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -11,18 +11,18 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 450d02a302c..e12445b2c84 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -11,18 +11,18 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" diff --git 
a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp index ce5cd4192a9..b029e09d400 100644 --- a/hip/solver/common_trs_kernels.hip.hpp +++ b/hip/solver/common_trs_kernels.hip.hpp @@ -20,13 +20,13 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" namespace gko { diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp index 322c87d37b3..5eab76ed5fa 100644 --- a/hip/solver/lower_trs_kernels.hip.cpp +++ b/hip/solver/lower_trs_kernels.hip.cpp @@ -18,10 +18,10 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" -#include "hip/base/math.hip.hpp" #include "hip/solver/common_trs_kernels.hip.hpp" diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp index 6be850959cb..fb480d9b22d 100644 --- a/hip/solver/upper_trs_kernels.hip.cpp +++ b/hip/solver/upper_trs_kernels.hip.cpp @@ -18,10 +18,10 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/types.hpp" -#include "hip/base/math.hip.hpp" #include "hip/solver/common_trs_kernels.hip.hpp" diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp index 8f856f0ed8d..0b8e300f978 100644 --- a/hip/stop/criterion_kernels.hip.cpp +++ b/hip/stop/criterion_kernels.hip.cpp @@ -8,9 +8,9 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/types.hpp" -#include 
"hip/base/math.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" namespace gko { diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp index eb6c89a2e2e..0a9af423128 100644 --- a/hip/stop/residual_norm_kernels.hip.cpp +++ b/hip/stop/residual_norm_kernels.hip.cpp @@ -8,11 +8,11 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/base/array_access.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" namespace gko { diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp index 1a882989854..01fb96afa7c 100644 --- a/hip/test/base/math.hip.cpp +++ b/hip/test/base/math.hip.cpp @@ -8,7 +8,7 @@ // clang-format on -#include "hip/base/math.hip.hpp" +#include "common/cuda_hip/base/math.hpp" #include #include From e89b595f9cca516dbfa76a0cd6a260b1803f2400 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 29 Jun 2024 16:10:10 +0200 Subject: [PATCH 041/448] fix fbcsr instantiation --- common/cuda_hip/CMakeLists.txt | 18 ------- .../matrix/fbcsr_kernels.instantiate.cpp | 6 +-- ...kernels.cpp => fbcsr_kernels.template.cpp} | 0 cuda/CMakeLists.txt | 4 +- hip/CMakeLists.txt | 2 +- hip/matrix/fbcsr_kernels.instantiate.hip.cpp | 47 ------------------- 6 files changed, 6 insertions(+), 71 deletions(-) rename cuda/matrix/fbcsr_kernels.instantiate.cu => common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp (90%) rename common/cuda_hip/matrix/{fbcsr_kernels.cpp => fbcsr_kernels.template.cpp} (100%) delete mode 100644 hip/matrix/fbcsr_kernels.instantiate.hip.cpp diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 2cfbe6e9b0d..79af0c5fd0d 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -2,22 +2,7 @@ 
include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) set(CUDA_HIP_SOURCES base/batch_multi_vector_kernels.cpp base/device_matrix_data_kernels.cpp - base/kernel_launch.hpp - base/kernel_launch_reduction.hpp - base/kernel_launch_solver.hpp - components/atomic.hpp - components/diagonal_block_manipulation.hpp - components/intrinsics.hpp - components/merging.hpp - components/prefix_sum.hpp components/prefix_sum_kernels.cpp - components/reduction.hpp - components/searching.hpp - components/segment_scan.hpp - components/sorting.hpp - components/syncfree.hpp - components/thread_ids.hpp - components/warp_blas.hpp distributed/matrix_kernels.cpp distributed/partition_helpers_kernels.cpp distributed/partition_kernels.cpp @@ -27,7 +12,6 @@ set(CUDA_HIP_SOURCES factorization/lu_kernels.cpp factorization/par_ic_kernels.cpp factorization/par_ilu_kernels.cpp - log/batch_logger.hpp matrix/batch_csr_kernels.cpp matrix/batch_dense_kernels.cpp matrix/batch_ell_kernels.cpp @@ -35,7 +19,6 @@ set(CUDA_HIP_SOURCES matrix/dense_kernels.cpp matrix/diagonal_kernels.cpp matrix/ell_kernels.cpp - matrix/fbcsr_kernels.cpp matrix/sellp_kernels.cpp matrix/sparsity_csr_kernels.cpp multigrid/pgm_kernels.cpp @@ -45,7 +28,6 @@ set(CUDA_HIP_SOURCES solver/cb_gmres_kernels.cpp solver/idr_kernels.cpp solver/multigrid_kernels.cpp - stop/batch_criteria.hpp ) list(TRANSFORM CUDA_HIP_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/) set(GKO_CUDA_HIP_COMMON_SOURCES ${CUDA_HIP_SOURCES} PARENT_SCOPE) diff --git a/cuda/matrix/fbcsr_kernels.instantiate.cu b/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp similarity index 90% rename from cuda/matrix/fbcsr_kernels.instantiate.cu rename to common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp index f6165ac5e5c..a3beaac4a85 100644 --- a/cuda/matrix/fbcsr_kernels.instantiate.cu +++ b/common/cuda_hip/matrix/fbcsr_kernels.instantiate.cpp @@ -2,12 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "cuda/matrix/fbcsr_kernels.template.cu" +#include 
"common/cuda_hip/matrix/fbcsr_kernels.template.cpp" namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The fixed-size block compressed sparse row matrix format namespace. * @@ -42,6 +42,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace fbcsr -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/common/cuda_hip/matrix/fbcsr_kernels.cpp b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp similarity index 100% rename from common/cuda_hip/matrix/fbcsr_kernels.cpp rename to common/cuda_hip/matrix/fbcsr_kernels.template.cpp diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index bef62c12a9a..4dd7bccd2c9 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) add_library(ginkgo_cuda $ "") include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE) -add_instantiation_files(. 
matrix/fbcsr_kernels.instantiate.cu FBCSR_INSTANTIATE) +add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda @@ -44,7 +44,7 @@ target_sources(ginkgo_cuda ${GKO_CUDA_HIP_COMMON_SOURCES} ) # override the default language mapping for the common files, set them to CUDA -foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES) +foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES FBCSR_INSTANTIATE) set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA) endforeach(source_file) if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 19f4dd54b2a..3de4f4b4d65 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.21) include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE) -add_instantiation_files(. 
matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANTIATE) +add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES diff --git a/hip/matrix/fbcsr_kernels.instantiate.hip.cpp b/hip/matrix/fbcsr_kernels.instantiate.hip.cpp deleted file mode 100644 index 54e90fc4297..00000000000 --- a/hip/matrix/fbcsr_kernels.instantiate.hip.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "hip/matrix/fbcsr_kernels.template.hip.cpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The fixed-size block compressed sparse row matrix format namespace. - * - * @ingroup fbcsr - */ -namespace fbcsr { - - -// begin -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); -// end - - -} // namespace fbcsr -} // namespace hip -} // namespace kernels -} // 
namespace gko From 2b71d3de0c9da74198037e842d9720ac3450b9a8 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 29 Jun 2024 18:29:33 +0200 Subject: [PATCH 042/448] fix includes for thrust and sparselib block --- .../base/sparselib_block_bindings.hpp | 18 +++++++++ cuda/base/thrust.cuh | 31 -------------- cuda/distributed/index_map_kernels.cu | 2 +- cuda/matrix/csr_kernels.template.cu | 2 +- hip/base/thrust.hip.hpp | 40 ------------------- hip/distributed/index_map_kernels.hip.cpp | 2 +- hip/matrix/csr_kernels.template.hip.cpp | 2 +- 7 files changed, 22 insertions(+), 75 deletions(-) create mode 100644 common/cuda_hip/base/sparselib_block_bindings.hpp delete mode 100644 cuda/base/thrust.cuh delete mode 100644 hip/base/thrust.hip.hpp diff --git a/common/cuda_hip/base/sparselib_block_bindings.hpp b/common/cuda_hip/base/sparselib_block_bindings.hpp new file mode 100644 index 00000000000..38bbebc6c14 --- /dev/null +++ b/common/cuda_hip/base/sparselib_block_bindings.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BLOCK_BINDINGS_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BLOCK_BINDINGS_HPP_ + + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/base/cusparse_block_bindings.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/hipsparse_block_bindings.hip.hpp" +#else +#error "Executor definition missing" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BLOCK_BINDINGS_HPP_ diff --git a/cuda/base/thrust.cuh b/cuda/base/thrust.cuh deleted file mode 100644 index 5d5d58e0f33..00000000000 --- a/cuda/base/thrust.cuh +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_BASE_THRUST_CUH_ -#define GKO_CUDA_BASE_THRUST_CUH_ - - -#include -#include - -#include - - -namespace gko { -namespace kernels { -namespace cuda { - - -inline auto 
thrust_policy(std::shared_ptr exec) -{ - return thrust::cuda::par.on(exec->get_stream()); -} - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_BASE_THRUST_CUH_ diff --git a/cuda/distributed/index_map_kernels.cu b/cuda/distributed/index_map_kernels.cu index e55a4148e51..3c23d098a0e 100644 --- a/cuda/distributed/index_map_kernels.cu +++ b/cuda/distributed/index_map_kernels.cu @@ -19,9 +19,9 @@ #include +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/atomic.hpp" #include "common/cuda_hip/components/searching.hpp" -#include "cuda/base/thrust.cuh" namespace gko { diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu index 151351c9204..c8d193e09af 100644 --- a/cuda/matrix/csr_kernels.template.cu +++ b/cuda/matrix/csr_kernels.template.cu @@ -28,6 +28,7 @@ #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/atomic.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" @@ -49,7 +50,6 @@ #include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/thrust.cuh" namespace gko { diff --git a/hip/base/thrust.hip.hpp b/hip/base/thrust.hip.hpp deleted file mode 100644 index 2aecdd79328..00000000000 --- a/hip/base/thrust.hip.hpp +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_BASE_THRUST_HIP_HPP_ -#define GKO_HIP_BASE_THRUST_HIP_HPP_ - - -#include - -#include -#include -#if GINKGO_HIP_PLATFORM_HCC -#include -#else -#include -#endif - - -namespace gko { -namespace kernels { -namespace hip { - - -inline auto thrust_policy(std::shared_ptr exec) -{ 
-#if GINKGO_HIP_PLATFORM_HCC - return thrust::hip::par.on(exec->get_stream()); -#else - return thrust::cuda::par.on(exec->get_stream()); -#endif -} - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_BASE_THRUST_HIP_HPP_ diff --git a/hip/distributed/index_map_kernels.hip.cpp b/hip/distributed/index_map_kernels.hip.cpp index c722952f85d..67ff2f72857 100644 --- a/hip/distributed/index_map_kernels.hip.cpp +++ b/hip/distributed/index_map_kernels.hip.cpp @@ -19,9 +19,9 @@ #include +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/atomic.hpp" #include "common/cuda_hip/components/searching.hpp" -#include "hip/base/thrust.hip.hpp" namespace gko { diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp index f7766b8648b..473361029c8 100644 --- a/hip/matrix/csr_kernels.template.hip.cpp +++ b/hip/matrix/csr_kernels.template.hip.cpp @@ -29,6 +29,7 @@ #include "common/cuda_hip/base/pointer_mode_guard.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/atomic.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" @@ -49,7 +50,6 @@ #include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/thrust.hip.hpp" namespace gko { From 46d9259c008b588efd3d953969e2b726a6c951b7 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 29 Jun 2024 18:30:50 +0200 Subject: [PATCH 043/448] revert batch changes --- core/solver/batch_dispatch.hpp | 17 +++--- cuda/base/batch_multi_vector_kernels.cu | 56 +++++++++++++++++++ cuda/log/batch_logger.cuh | 27 +++++++++ cuda/matrix/batch_csr_kernels.cu | 55 ++++++++++++++++++ cuda/matrix/batch_dense_kernels.cu | 56 +++++++++++++++++++ cuda/matrix/batch_ell_kernels.cu | 55 
++++++++++++++++++ cuda/preconditioner/batch_jacobi_kernels.cu | 4 +- cuda/preconditioner/batch_preconditioners.cuh | 2 +- cuda/solver/batch_bicgstab_kernels.cu | 6 +- cuda/solver/batch_cg_kernels.cu | 6 +- cuda/stop/batch_criteria.cuh | 26 +++++++++ hip/base/batch_multi_vector_kernels.hip.cpp | 56 +++++++++++++++++++ hip/log/batch_logger.hip.hpp | 26 +++++++++ hip/matrix/batch_csr_kernels.hip.cpp | 55 ++++++++++++++++++ hip/matrix/batch_dense_kernels.hip.cpp | 56 +++++++++++++++++++ hip/matrix/batch_ell_kernels.hip.cpp | 55 ++++++++++++++++++ .../batch_jacobi_kernels.hip.cpp | 10 ++-- .../batch_preconditioners.hip.hpp | 2 +- hip/solver/batch_bicgstab_kernels.hip.cpp | 8 +-- hip/solver/batch_cg_kernels.hip.cpp | 8 +-- hip/stop/batch_criteria.hip.hpp | 26 +++++++++ 21 files changed, 580 insertions(+), 32 deletions(-) create mode 100644 cuda/base/batch_multi_vector_kernels.cu create mode 100644 cuda/log/batch_logger.cuh create mode 100644 cuda/matrix/batch_csr_kernels.cu create mode 100644 cuda/matrix/batch_dense_kernels.cu create mode 100644 cuda/matrix/batch_ell_kernels.cu create mode 100644 cuda/stop/batch_criteria.cuh create mode 100644 hip/base/batch_multi_vector_kernels.hip.cpp create mode 100644 hip/log/batch_logger.hip.hpp create mode 100644 hip/matrix/batch_csr_kernels.hip.cpp create mode 100644 hip/matrix/batch_dense_kernels.hip.cpp create mode 100644 hip/matrix/batch_ell_kernels.hip.cpp create mode 100644 hip/stop/batch_criteria.hip.hpp diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index ce8c4d86e71..8a142a5224a 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -21,15 +21,15 @@ #include "core/matrix/batch_struct.hpp" -#if defined(GKO_COMPILING_CUDA) +#if defined GKO_COMPILING_CUDA -#include "common/cuda_hip/log/batch_logger.hpp" -#include "common/cuda_hip/stop/batch_criteria.hpp" #include "cuda/base/batch_struct.hpp" #include "cuda/components/cooperative_groups.cuh" +#include 
"cuda/log/batch_logger.cuh" #include "cuda/matrix/batch_struct.hpp" #include "cuda/preconditioner/batch_preconditioners.cuh" +#include "cuda/stop/batch_criteria.cuh" namespace gko { @@ -37,12 +37,11 @@ namespace batch { namespace solver { -namespace device = gko::kernels::GKO_DEVICE_NAMESPACE; +namespace device = gko::kernels::cuda; template -using DeviceValueType = - typename gko::kernels::GKO_DEVICE_NAMESPACE::device_type; +using DeviceValueType = typename gko::kernels::cuda::cuda_type; } // namespace solver @@ -50,15 +49,15 @@ using DeviceValueType = } // namespace gko -#elif defined(GKO_COMPILING_HIP) +#elif defined GKO_COMPILING_HIP -#include "common/cuda_hip/log/batch_logger.hpp" -#include "common/cuda_hip/stop/batch_criteria.hpp" #include "hip/base/batch_struct.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" +#include "hip/log/batch_logger.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" #include "hip/preconditioner/batch_preconditioners.hip.hpp" +#include "hip/stop/batch_criteria.hip.hpp" namespace gko { diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu new file mode 100644 index 00000000000..704192d0bff --- /dev/null +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/base/batch_multi_vector_kernels.hpp" + +#include +#include + +#include +#include + +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "core/base/batch_struct.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/base/thrust.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" + + +namespace gko 
{ +namespace kernels { +namespace cuda { +/** + * @brief The MultiVector matrix format namespace. + * + * @ingroup batch_multi_vector + */ +namespace batch_multi_vector { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" + + +#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_multi_vector +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh new file mode 100644 index 00000000000..3e53d6ef0a6 --- /dev/null +++ b/cuda/log/batch_logger.cuh @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CUDA_LOG_BATCH_LOGGER_CUH_ +#define GKO_CUDA_LOG_BATCH_LOGGER_CUH_ + + +#include + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_log { + + +#include "common/cuda_hip/log/batch_logger.hpp.inc" + + +} // namespace batch_log +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_LOG_BATCH_LOGGER_CUH_ diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu new file mode 100644 index 00000000000..4fc5137646c --- /dev/null +++ b/cuda/matrix/batch_csr_kernels.cu @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/matrix/batch_csr_kernels.hpp" + +#include + +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/base/thrust.cuh" +#include 
"cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" +#include "cuda/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Csr matrix format namespace. + * @ref Csr + * @ingroup batch_csr + */ +namespace batch_csr { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_csr +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu new file mode 100644 index 00000000000..e28d4f91670 --- /dev/null +++ b/cuda/matrix/batch_dense_kernels.cu @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/matrix/batch_dense_kernels.hpp" + +#include + +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/base/thrust.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" +#include "cuda/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Dense matrix format namespace. 
+ * + * @ingroup batch_dense + */ +namespace batch_dense { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" + + +// clang-format on + + +} // namespace batch_dense +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu new file mode 100644 index 00000000000..90caf963200 --- /dev/null +++ b/cuda/matrix/batch_ell_kernels.cu @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/matrix/batch_ell_kernels.hpp" + +#include + +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/base/thrust.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" +#include "cuda/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Ell matrix format namespace. 
+ * @ref Ell + * @ingroup batch_ell + */ +namespace batch_ell { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_ell +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu index e31e17dcafc..1bc39df9781 100644 --- a/cuda/preconditioner/batch_jacobi_kernels.cu +++ b/cuda/preconditioner/batch_jacobi_kernels.cu @@ -8,8 +8,6 @@ #include #include -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -20,6 +18,8 @@ #include "cuda/base/config.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/thread_ids.cuh" #include "cuda/matrix/batch_struct.hpp" #include "cuda/preconditioner/jacobi_common.hpp" diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh index 01001c036b2..e83d6e04ee9 100644 --- a/cuda/preconditioner/batch_preconditioners.cuh +++ b/cuda/preconditioner/batch_preconditioners.cuh @@ -7,9 +7,9 @@ #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" +#include "cuda/components/reduction.cuh" namespace gko { diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 8d76f865a20..28efaf07475 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ 
b/cuda/solver/batch_bicgstab_kernels.cu @@ -14,14 +14,14 @@ #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "cuda/base/batch_struct.hpp" #include "cuda/base/thrust.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" #include "cuda/matrix/batch_struct.hpp" diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 2083cd98b5a..cff72652629 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -13,14 +13,14 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "cuda/base/batch_struct.hpp" #include "cuda/base/thrust.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" #include "cuda/matrix/batch_struct.hpp" diff --git a/cuda/stop/batch_criteria.cuh b/cuda/stop/batch_criteria.cuh new file mode 100644 index 00000000000..f4f434dda11 --- /dev/null +++ b/cuda/stop/batch_criteria.cuh @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ +#define 
GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ + + +#include + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_stop { + + +#include "common/cuda_hip/stop/batch_criteria.hpp.inc" + + +} // namespace batch_stop +} // namespace cuda +} // namespace kernels +} // namespace gko + +#endif // GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp new file mode 100644 index 00000000000..86b16c8975d --- /dev/null +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/base/batch_multi_vector_kernels.hpp" + +#include +#include + +#include +#include + +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "core/base/batch_struct.hpp" +#include "hip/base/batch_struct.hip.hpp" +#include "hip/base/thrust.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The MultiVector matrix format namespace. 
+ * + * @ingroup batch_multi_vector + */ +namespace batch_multi_vector { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" + + +#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_multi_vector +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/log/batch_logger.hip.hpp b/hip/log/batch_logger.hip.hpp new file mode 100644 index 00000000000..a2540f2bd9d --- /dev/null +++ b/hip/log/batch_logger.hip.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ +#define GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ + + +#include + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_log { + +#include "common/cuda_hip/log/batch_logger.hpp.inc" + + +} // namespace batch_log +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp new file mode 100644 index 00000000000..4b0e6799834 --- /dev/null +++ b/hip/matrix/batch_csr_kernels.hip.cpp @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/matrix/batch_csr_kernels.hpp" + +#include + +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "hip/base/batch_struct.hip.hpp" +#include "hip/base/thrust.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include 
"hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Csr matrix format namespace. + * @ref Csr + * @ingroup batch_csr + */ +namespace batch_csr { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_csr +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp new file mode 100644 index 00000000000..328f268251f --- /dev/null +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/matrix/batch_dense_kernels.hpp" + +#include + +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "hip/base/batch_struct.hip.hpp" +#include "hip/base/thrust.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Dense matrix format namespace. 
+ * + * @ingroup batch_dense + */ +namespace batch_dense { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" + + +// clang-format on + + +} // namespace batch_dense +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp new file mode 100644 index 00000000000..01294ac3d63 --- /dev/null +++ b/hip/matrix/batch_ell_kernels.hip.cpp @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/matrix/batch_ell_kernels.hpp" + +#include + +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "hip/base/batch_struct.hip.hpp" +#include "hip/base/thrust.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Ell matrix format namespace. 
+ * @ref Ell + * @ingroup batch_ell + */ +namespace batch_ell { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_ell +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp index a112e3beb92..db6e5a27b58 100644 --- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp +++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp @@ -8,11 +8,6 @@ #include #include -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/components/diagonal_block_manipulation.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -20,8 +15,13 @@ #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/batch_struct.hip.hpp" #include "hip/base/config.hip.hpp" +#include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/diagonal_block_manipulation.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" +#include "hip/components/warp_blas.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp index f62000ff46f..f3969c16b81 100644 --- a/hip/preconditioner/batch_preconditioners.hip.hpp +++ 
b/hip/preconditioner/batch_preconditioners.hip.hpp @@ -7,9 +7,9 @@ #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" +#include "hip/components/reduction.hip.hpp" namespace gko { diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 96587f8479e..44e2f0f3c48 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -11,18 +11,18 @@ #include #include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "hip/base/batch_struct.hip.hpp" +#include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index e12445b2c84..450d02a302c 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -11,18 +11,18 @@ #include #include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" 
-#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "hip/base/batch_struct.hip.hpp" +#include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" diff --git a/hip/stop/batch_criteria.hip.hpp b/hip/stop/batch_criteria.hip.hpp new file mode 100644 index 00000000000..1f721e36aaf --- /dev/null +++ b/hip/stop/batch_criteria.hip.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ +#define GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ + + +#include + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_stop { + + +#include "common/cuda_hip/stop/batch_criteria.hpp.inc" + + +} // namespace batch_stop +} // namespace hip +} // namespace kernels +} // namespace gko + +#endif // GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ From 9d76d0f69ba894b8b9cffad18d36ff65f1510a2b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 29 Jun 2024 19:26:35 +0200 Subject: [PATCH 044/448] fix batch --- common/cuda_hip/CMakeLists.txt | 4 -- ...cpp => batch_multi_vector_kernels.hpp.inc} | 52 ------------------- ...{batch_logger.hpp => batch_logger.hpp.inc} | 22 -------- ..._kernels.cpp => batch_csr_kernels.hpp.inc} | 51 ------------------ ...ernels.cpp => batch_dense_kernels.hpp.inc} | 52 ------------------- ..._kernels.cpp => batch_ell_kernels.hpp.inc} | 51 ------------------ .../preconditioner/jacobi_kernels.cpp | 2 +- ...ch_criteria.hpp => batch_criteria.hpp.inc} | 21 -------- cuda/CMakeLists.txt | 6 ++- cuda/base/batch_multi_vector_kernels.cu | 8 +-- cuda/matrix/batch_csr_kernels.cu | 8 +-- cuda/matrix/batch_dense_kernels.cu | 8 +-- 
cuda/matrix/batch_ell_kernels.cu | 8 +-- cuda/preconditioner/batch_jacobi_kernels.cu | 6 +-- cuda/preconditioner/batch_preconditioners.cuh | 2 +- .../jacobi_advanced_apply_kernels.cu | 2 +- ...cobi_advanced_apply_kernels.instantiate.cu | 2 +- .../preconditioner/jacobi_generate_kernels.cu | 2 +- .../jacobi_generate_kernels.instantiate.cu | 2 +- .../jacobi_simple_apply_kernels.cu | 2 +- ...jacobi_simple_apply_kernels.instantiate.cu | 2 +- cuda/solver/batch_bicgstab_kernels.cu | 9 ++-- cuda/solver/batch_cg_kernels.cu | 9 ++-- cuda/test/components/merging.cu | 2 +- cuda/test/components/searching.cu | 2 +- cuda/test/components/sorting.cu | 2 +- hip/CMakeLists.txt | 8 ++- hip/base/batch_multi_vector_kernels.hip.cpp | 8 +-- hip/matrix/batch_csr_kernels.hip.cpp | 8 +-- hip/matrix/batch_dense_kernels.hip.cpp | 8 +-- hip/matrix/batch_ell_kernels.hip.cpp | 8 +-- .../batch_jacobi_kernels.hip.cpp | 12 ++--- .../batch_preconditioners.hip.hpp | 2 +- .../jacobi_advanced_apply_kernels.hip.cpp | 2 +- ...advanced_apply_kernels.instantiate.hip.cpp | 2 +- .../jacobi_generate_kernels.hip.cpp | 2 +- ...acobi_generate_kernels.instantiate.hip.cpp | 2 +- .../jacobi_simple_apply_kernels.hip.cpp | 2 +- ...i_simple_apply_kernels.instantiate.hip.cpp | 2 +- hip/solver/batch_bicgstab_kernels.hip.cpp | 11 ++-- hip/solver/batch_cg_kernels.hip.cpp | 11 ++-- hip/test/components/merging.hip.cpp | 2 +- hip/test/components/searching.hip.cpp | 2 +- hip/test/components/sorting.hip.cpp | 2 +- 44 files changed, 91 insertions(+), 340 deletions(-) rename common/cuda_hip/base/{batch_multi_vector_kernels.cpp => batch_multi_vector_kernels.hpp.inc} (89%) rename common/cuda_hip/log/{batch_logger.hpp => batch_logger.hpp.inc} (67%) rename common/cuda_hip/matrix/{batch_csr_kernels.cpp => batch_csr_kernels.hpp.inc} (87%) rename common/cuda_hip/matrix/{batch_dense_kernels.cpp => batch_dense_kernels.hpp.inc} (89%) rename common/cuda_hip/matrix/{batch_ell_kernels.cpp => batch_ell_kernels.hpp.inc} (87%) rename 
common/cuda_hip/stop/{batch_criteria.hpp => batch_criteria.hpp.inc} (75%) diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 79af0c5fd0d..0225e3ad872 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -1,6 +1,5 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) set(CUDA_HIP_SOURCES - base/batch_multi_vector_kernels.cpp base/device_matrix_data_kernels.cpp components/prefix_sum_kernels.cpp distributed/matrix_kernels.cpp @@ -12,9 +11,6 @@ set(CUDA_HIP_SOURCES factorization/lu_kernels.cpp factorization/par_ic_kernels.cpp factorization/par_ilu_kernels.cpp - matrix/batch_csr_kernels.cpp - matrix/batch_dense_kernels.cpp - matrix/batch_ell_kernels.cpp matrix/coo_kernels.cpp matrix/dense_kernels.cpp matrix/diagonal_kernels.cpp diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.cpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc similarity index 89% rename from common/cuda_hip/base/batch_multi_vector_kernels.cpp rename to common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index 0261dbb97ce..9b6301674be 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.cpp +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -2,47 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/base/batch_multi_vector_kernels.hpp" - -#include -#include - -#include -#include - -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/blas_bindings.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/pointer_mode_guard.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/batch_struct.hpp" - - -namespace gko { -namespace 
kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The MultiVector matrix format namespace. - * - * @ingroup batch_multi_vector - */ -namespace batch_multi_vector { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - - template __device__ __forceinline__ void scale( const gko::batch::multi_vector::batch_item& alpha, @@ -340,14 +299,3 @@ __launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel( copy(src_b, dst_b); } } - - -#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_multi_vector -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/log/batch_logger.hpp b/common/cuda_hip/log/batch_logger.hpp.inc similarity index 67% rename from common/cuda_hip/log/batch_logger.hpp rename to common/cuda_hip/log/batch_logger.hpp.inc index bca07fb9c37..04b614b50f9 100644 --- a/common/cuda_hip/log/batch_logger.hpp +++ b/common/cuda_hip/log/batch_logger.hpp.inc @@ -2,19 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_INC_ - - -#include - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -namespace batch_log { - - /** * @see reference/log/batch_logger.hpp */ @@ -41,12 +28,3 @@ class SimpleFinalLogger final { real_type* const final_residuals_; idx_type* const final_iters_; }; - - -} // namespace batch_log -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko - - -#endif // GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_INC_ \ No newline at end of file diff --git a/common/cuda_hip/matrix/batch_csr_kernels.cpp b/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc similarity index 87% rename from common/cuda_hip/matrix/batch_csr_kernels.cpp 
rename to common/cuda_hip/matrix/batch_csr_kernels.hpp.inc index 01edb0e1310..e041dadaa3e 100644 --- a/common/cuda_hip/matrix/batch_csr_kernels.cpp +++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc @@ -2,46 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/matrix/batch_csr_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "common/cuda_hip/matrix/batch_struct.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The Csr matrix format namespace. 
- * @ref Csr - * @ingroup batch_csr - */ -namespace batch_csr { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - - template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::csr::batch_item& mat, @@ -236,14 +196,3 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } - - -#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_csr -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/matrix/batch_dense_kernels.cpp b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc similarity index 89% rename from common/cuda_hip/matrix/batch_dense_kernels.cpp rename to common/cuda_hip/matrix/batch_dense_kernels.hpp.inc index 90cafc5d1ca..f8abf9131a1 100644 --- a/common/cuda_hip/matrix/batch_dense_kernels.cpp +++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc @@ -2,46 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/matrix/batch_dense_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "common/cuda_hip/matrix/batch_struct.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The Dense matrix format namespace. 
- * - * @ingroup batch_dense - */ -namespace batch_dense { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - - template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::dense::batch_item& mat, @@ -283,15 +243,3 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } - - -#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" - - -// clang-format on - - -} // namespace batch_dense -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/matrix/batch_ell_kernels.cpp b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc similarity index 87% rename from common/cuda_hip/matrix/batch_ell_kernels.cpp rename to common/cuda_hip/matrix/batch_ell_kernels.hpp.inc index c5e27e9d1d1..0a6d1927c96 100644 --- a/common/cuda_hip/matrix/batch_ell_kernels.cpp +++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc @@ -2,46 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/matrix/batch_ell_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/batch_struct.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "common/cuda_hip/matrix/batch_struct.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The Ell matrix format namespace. 
- * @ref Ell - * @ingroup batch_ell - */ -namespace batch_ell { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - - template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::ell::batch_item& mat, @@ -245,14 +205,3 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } - - -#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_ell -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.cpp index 27069d2f693..3c581546be2 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp +++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp @@ -12,10 +12,10 @@ #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/preconditioner/jacobi_common.hpp" #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/common/cuda_hip/stop/batch_criteria.hpp b/common/cuda_hip/stop/batch_criteria.hpp.inc similarity index 75% rename from common/cuda_hip/stop/batch_criteria.hpp rename to common/cuda_hip/stop/batch_criteria.hpp.inc index cecaa6b19d1..38072467765 100644 --- a/common/cuda_hip/stop/batch_criteria.hpp +++ b/common/cuda_hip/stop/batch_criteria.hpp.inc @@ -2,19 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_INC_ - - -#include - - -namespace gko { -namespace 
kernels { -namespace GKO_DEVICE_NAMESPACE { -namespace batch_stop { - - /** * @see reference/stop/batch_criteria.hpp */ @@ -62,11 +49,3 @@ class SimpleAbsResidual { private: const real_type abs_tol_; }; - - -} // namespace batch_stop -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko - -#endif // GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_INC_ \ No newline at end of file diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 4dd7bccd2c9..1552f4f3ee5 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -7,6 +7,7 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kerne list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda PRIVATE + base/batch_multi_vector_kernels.cu base/device.cpp base/exception.cpp base/executor.cpp @@ -27,6 +28,9 @@ target_sources(ginkgo_cuda factorization/par_ilut_select_kernels.cu factorization/par_ilut_spgeam_kernels.cu factorization/par_ilut_sweep_kernels.cu + matrix/batch_csr_kernels.cu + matrix/batch_dense_kernels.cu + matrix/batch_ell_kernels.cu ${CSR_INSTANTIATE} ${FBCSR_INSTANTIATE} matrix/fft_kernels.cu @@ -97,7 +101,7 @@ target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAM # include path for generated headers like jacobi_common.hpp target_include_directories(ginkgo_cuda - PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..) 
+ PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(ginkgo_cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cusparse CUDA::curand CUDA::cufft nvtx::nvtx) # NVTX3 is header-only and requires dlopen/dlclose in static builds target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device ${CMAKE_DL_LIBS}) diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index 704192d0bff..3dad5ba94f1 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -14,13 +14,13 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" namespace gko { diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu index 4fc5137646c..95b4f85cdfc 100644 --- a/cuda/matrix/batch_csr_kernels.cu +++ b/cuda/matrix/batch_csr_kernels.cu @@ -12,14 +12,14 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include 
"cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" #include "cuda/matrix/batch_struct.hpp" diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu index e28d4f91670..10148ee242b 100644 --- a/cuda/matrix/batch_dense_kernels.cu +++ b/cuda/matrix/batch_dense_kernels.cu @@ -12,14 +12,14 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" #include "cuda/matrix/batch_struct.hpp" diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu index 90caf963200..25281cf6f81 100644 --- a/cuda/matrix/batch_ell_kernels.cu +++ b/cuda/matrix/batch_ell_kernels.cu @@ -12,14 +12,14 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" #include "cuda/matrix/batch_struct.hpp" diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu 
b/cuda/preconditioner/batch_jacobi_kernels.cu index 1bc39df9781..178b53d04ea 100644 --- a/cuda/preconditioner/batch_jacobi_kernels.cu +++ b/cuda/preconditioner/batch_jacobi_kernels.cu @@ -8,6 +8,8 @@ #include #include +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -18,10 +20,8 @@ #include "cuda/base/config.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/thread_ids.cuh" #include "cuda/matrix/batch_struct.hpp" -#include "cuda/preconditioner/jacobi_common.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh index e83d6e04ee9..01001c036b2 100644 --- a/cuda/preconditioner/batch_preconditioners.cuh +++ b/cuda/preconditioner/batch_preconditioners.cuh @@ -7,9 +7,9 @@ #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" -#include "cuda/components/reduction.cuh" namespace gko { diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu index 74c7dea9b6b..fca6b24ba05 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu @@ -7,7 +7,7 @@ #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/preconditioner/jacobi_common.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu 
b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu index 60823cf6f4b..80c3b5e1e73 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu @@ -15,7 +15,7 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/preconditioner/jacobi_common.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/preconditioner/jacobi_generate_kernels.cu b/cuda/preconditioner/jacobi_generate_kernels.cu index 651dcec611a..e558594f5ce 100644 --- a/cuda/preconditioner/jacobi_generate_kernels.cu +++ b/cuda/preconditioner/jacobi_generate_kernels.cu @@ -8,7 +8,7 @@ #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/preconditioner/jacobi_common.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu index ff36c8efb1b..0dc21311af9 100644 --- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu @@ -18,7 +18,7 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/preconditioner/jacobi_common.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.cu index 5cac209b8b2..0bb09b1064a 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernels.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernels.cu @@ -7,7 +7,7 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include 
"core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/preconditioner/jacobi_common.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu index d727c9439f9..0721c03126b 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu @@ -15,7 +15,7 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/preconditioner/jacobi_common.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 28efaf07475..6b3dca28607 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -12,16 +12,16 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" #include "cuda/matrix/batch_struct.hpp" @@ -44,7 +44,6 @@ namespace batch_bicgstab { #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" #include 
"common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index cff72652629..746be0365e7 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -11,16 +11,16 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" #include "cuda/matrix/batch_struct.hpp" @@ -43,7 +43,6 @@ namespace batch_cg { #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu index 2788767b078..0a66c92ca3a 100644 --- a/cuda/test/components/merging.cu +++ b/cuda/test/components/merging.cu @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "cuda/components/merging.cuh" +#include "common/cuda_hip/components/merging.hpp" #include #include diff --git a/cuda/test/components/searching.cu b/cuda/test/components/searching.cu index afe7fb4b442..d0166418448 100644 --- a/cuda/test/components/searching.cu +++ 
b/cuda/test/components/searching.cu @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "cuda/components/searching.cuh" +#include "common/cuda_hip/components/searching.hpp" #include #include diff --git a/cuda/test/components/sorting.cu b/cuda/test/components/sorting.cu index e1524ce0078..0cc54e5904e 100644 --- a/cuda/test/components/sorting.cu +++ b/cuda/test/components/sorting.cu @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "cuda/components/sorting.cuh" +#include "common/cuda_hip/components/sorting.hpp" #include #include diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 3de4f4b4d65..71d41ad47df 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -5,6 +5,7 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kerne # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES + base/batch_multi_vector_kernels.hip.cpp base/device.hip.cpp base/exception.hip.cpp base/executor.hip.cpp @@ -25,6 +26,9 @@ set(GINKGO_HIP_SOURCES factorization/par_ilut_select_kernels.hip.cpp factorization/par_ilut_spgeam_kernels.hip.cpp factorization/par_ilut_sweep_kernels.hip.cpp + matrix/batch_csr_kernels.hip.cpp + matrix/batch_dense_kernels.hip.cpp + matrix/batch_ell_kernels.hip.cpp ${CSR_INSTANTIATE} ${FBCSR_INSTANTIATE} preconditioner/batch_jacobi_kernels.hip.cpp @@ -83,14 +87,14 @@ foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES) ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) endforeach() string(REPLACE ";" "," GKO_HIP_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}") -configure_file(preconditioner/jacobi_common.hip.hpp.in preconditioner/jacobi_common.hip.hpp) +configure_file(preconditioner/jacobi_common.hip.hpp.in preconditioner/jacobi_common.hpp) 
set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES LANGUAGE HIP) add_library(ginkgo_hip $ ${GINKGO_HIP_SOURCES}) target_include_directories(ginkgo_hip PRIVATE - ${CMAKE_CURRENT_BINARY_DIR}/.. # for generated headers like jacobi_common.hip.hpp + ${CMAKE_CURRENT_BINARY_DIR} # for generated headers like jacobi_common.hip.hpp ) target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip) diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp index 86b16c8975d..701f4655a9a 100644 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -14,13 +14,13 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" namespace gko { diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp index 4b0e6799834..b77b9416505 100644 --- a/hip/matrix/batch_csr_kernels.hip.cpp +++ b/hip/matrix/batch_csr_kernels.hip.cpp @@ -12,14 +12,14 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include 
"core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp index 328f268251f..67dfd78e264 100644 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -12,14 +12,14 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp index 01294ac3d63..68b59c042f1 100644 --- a/hip/matrix/batch_ell_kernels.hip.cpp +++ b/hip/matrix/batch_ell_kernels.hip.cpp @@ -12,14 +12,14 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" 
-#include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp index db6e5a27b58..cfef615dcad 100644 --- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp +++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp @@ -8,6 +8,11 @@ #include #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/diagonal_block_manipulation.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -15,15 +20,10 @@ #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/batch_struct.hip.hpp" #include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/diagonal_block_manipulation.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" -#include "hip/preconditioner/jacobi_common.hip.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp index f3969c16b81..f62000ff46f 100644 --- a/hip/preconditioner/batch_preconditioners.hip.hpp +++ b/hip/preconditioner/batch_preconditioners.hip.hpp @@ -7,9 +7,9 @@ #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" #include "core/matrix/batch_struct.hpp" #include 
"core/preconditioner/batch_jacobi_helpers.hpp" -#include "hip/components/reduction.hip.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp index 0eccbb2d6eb..ce260ec1e16 100644 --- a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp @@ -7,7 +7,7 @@ #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/preconditioner/jacobi_common.hip.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp index d30f4edd787..9cc4978a1f8 100644 --- a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp @@ -16,7 +16,7 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/preconditioner/jacobi_common.hip.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_generate_kernels.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.hip.cpp index 3f6d3a4e91f..673ca8c373e 100644 --- a/hip/preconditioner/jacobi_generate_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernels.hip.cpp @@ -19,7 +19,7 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/preconditioner/jacobi_common.hip.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp 
b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp index 3c18703557d..a6be610a839 100644 --- a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp @@ -18,7 +18,7 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/preconditioner/jacobi_common.hip.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp index 563f5829536..72f2e4fe556 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp @@ -16,7 +16,7 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/preconditioner/jacobi_common.hip.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp index 7a6e2a46b04..1ea34bff93f 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp @@ -15,7 +15,7 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/preconditioner/jacobi_common.hip.hpp" +#include "preconditioner/jacobi_common.hpp" namespace gko { diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 44e2f0f3c48..92051a81640 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -11,18 +11,18 @@ #include #include 
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" @@ -43,7 +43,6 @@ namespace batch_bicgstab { #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 450d02a302c..2df02a6f0a8 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -11,18 +11,18 @@ #include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include 
"core/solver/batch_dispatch.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" @@ -43,7 +43,6 @@ namespace batch_cg { #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index 7fc3b9a173a..5c8cc21bd5e 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -8,7 +8,7 @@ // clang-format on -#include "hip/components/merging.hip.hpp" +#include "common/cuda_hip/components/merging.hpp" #include #include diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp index 85c54075231..d9dc6b47ab0 100644 --- a/hip/test/components/searching.hip.cpp +++ b/hip/test/components/searching.hip.cpp @@ -8,7 +8,7 @@ // clang-format on -#include "hip/components/searching.hip.hpp" +#include "common/cuda_hip/components/searching.hpp" #include #include diff --git a/hip/test/components/sorting.hip.cpp b/hip/test/components/sorting.hip.cpp index 79de1dc2269..653a0f536eb 100644 --- a/hip/test/components/sorting.hip.cpp +++ b/hip/test/components/sorting.hip.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "hip/components/sorting.hip.hpp" +#include "common/cuda_hip/components/sorting.hpp" #include #include From d53fa460c62c0c756f578ab46d93d2173310fb57 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 29 Jun 2024 20:04:09 +0200 Subject: [PATCH 045/448] add newlines to the end of files where missing --- 
common/cuda_hip/base/device_matrix_data_kernels.cpp | 2 +- common/cuda_hip/base/kernel_launch.hpp | 2 +- common/cuda_hip/base/kernel_launch_reduction.hpp | 2 +- common/cuda_hip/base/kernel_launch_solver.hpp | 2 +- common/cuda_hip/base/math.hpp | 2 +- common/cuda_hip/components/atomic.hpp | 2 +- common/cuda_hip/components/diagonal_block_manipulation.hpp | 2 +- common/cuda_hip/components/intrinsics.hpp | 2 +- common/cuda_hip/components/merging.hpp | 2 +- common/cuda_hip/components/prefix_sum.hpp | 2 +- common/cuda_hip/components/prefix_sum_kernels.cpp | 2 +- common/cuda_hip/components/reduction.hpp | 2 +- common/cuda_hip/components/searching.hpp | 2 +- common/cuda_hip/components/segment_scan.hpp | 2 +- common/cuda_hip/components/sorting.hpp | 2 +- common/cuda_hip/components/syncfree.hpp | 2 +- common/cuda_hip/components/thread_ids.hpp | 2 +- common/cuda_hip/components/uninitialized_array.hpp | 2 +- common/cuda_hip/components/warp_blas.hpp | 2 +- common/cuda_hip/distributed/matrix_kernels.cpp | 2 +- common/cuda_hip/distributed/partition_helpers_kernels.cpp | 2 +- common/cuda_hip/distributed/partition_kernels.cpp | 2 +- common/cuda_hip/distributed/vector_kernels.cpp | 2 +- common/cuda_hip/factorization/cholesky_kernels.cpp | 2 +- common/cuda_hip/factorization/factorization_kernels.cpp | 2 +- common/cuda_hip/factorization/lu_kernels.cpp | 2 +- common/cuda_hip/factorization/par_ic_kernels.cpp | 2 +- common/cuda_hip/factorization/par_ilu_kernels.cpp | 2 +- common/cuda_hip/matrix/coo_kernels.cpp | 2 +- common/cuda_hip/matrix/dense_kernels.cpp | 2 +- common/cuda_hip/matrix/diagonal_kernels.cpp | 2 +- common/cuda_hip/matrix/ell_kernels.cpp | 2 +- common/cuda_hip/matrix/fbcsr_kernels.template.cpp | 2 +- common/cuda_hip/matrix/sellp_kernels.cpp | 2 +- common/cuda_hip/matrix/sparsity_csr_kernels.cpp | 2 +- common/cuda_hip/multigrid/pgm_kernels.cpp | 2 +- common/cuda_hip/preconditioner/isai_kernels.cpp | 2 +- common/cuda_hip/preconditioner/jacobi_kernels.cpp | 2 +- 
common/cuda_hip/reorder/rcm_kernels.cpp | 2 +- common/cuda_hip/solver/cb_gmres_kernels.cpp | 2 +- common/cuda_hip/solver/idr_kernels.cpp | 2 +- common/cuda_hip/solver/multigrid_kernels.cpp | 2 +- 42 files changed, 42 insertions(+), 42 deletions(-) diff --git a/common/cuda_hip/base/device_matrix_data_kernels.cpp b/common/cuda_hip/base/device_matrix_data_kernels.cpp index 61a7a6281a9..c5742653a93 100644 --- a/common/cuda_hip/base/device_matrix_data_kernels.cpp +++ b/common/cuda_hip/base/device_matrix_data_kernels.cpp @@ -124,4 +124,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace components } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/base/kernel_launch.hpp b/common/cuda_hip/base/kernel_launch.hpp index dd20eb5769f..04c54786422 100644 --- a/common/cuda_hip/base/kernel_launch.hpp +++ b/common/cuda_hip/base/kernel_launch.hpp @@ -102,4 +102,4 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/base/kernel_launch_reduction.hpp b/common/cuda_hip/base/kernel_launch_reduction.hpp index 86e082ac2c1..4c4fb366802 100644 --- a/common/cuda_hip/base/kernel_launch_reduction.hpp +++ b/common/cuda_hip/base/kernel_launch_reduction.hpp @@ -527,4 +527,4 @@ void run_kernel_col_reduction_cached( } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/base/kernel_launch_solver.hpp b/common/cuda_hip/base/kernel_launch_solver.hpp index 742da85fd96..e32ba52e79a 100644 --- a/common/cuda_hip/base/kernel_launch_solver.hpp +++ b/common/cuda_hip/base/kernel_launch_solver.hpp @@ -50,4 +50,4 @@ void run_kernel_solver(std::shared_ptr exec, } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace 
gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index 44a26cadb53..ea11c7d73a9 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -54,4 +54,4 @@ struct truncate_type_impl> { } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_ diff --git a/common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp index e0384222734..3279c9433f1 100644 --- a/common/cuda_hip/components/atomic.hpp +++ b/common/cuda_hip/components/atomic.hpp @@ -250,4 +250,4 @@ __forceinline__ __device__ thrust::complex atomic_add( } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_ diff --git a/common/cuda_hip/components/diagonal_block_manipulation.hpp b/common/cuda_hip/components/diagonal_block_manipulation.hpp index 5c0be150d21..890d080018e 100644 --- a/common/cuda_hip/components/diagonal_block_manipulation.hpp +++ b/common/cuda_hip/components/diagonal_block_manipulation.hpp @@ -88,4 +88,4 @@ __device__ __forceinline__ void extract_transposed_diag_blocks( } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_ diff --git a/common/cuda_hip/components/intrinsics.hpp b/common/cuda_hip/components/intrinsics.hpp index 398e4325cc2..e8c236e22b1 100644 --- a/common/cuda_hip/components/intrinsics.hpp +++ b/common/cuda_hip/components/intrinsics.hpp @@ -55,4 +55,4 @@ __forceinline__ __device__ int clz(uint64 mask) { return __clzll(mask); } } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_ diff --git 
a/common/cuda_hip/components/merging.hpp b/common/cuda_hip/components/merging.hpp index b1bca2a0c78..4c1bfa4cd2d 100644 --- a/common/cuda_hip/components/merging.hpp +++ b/common/cuda_hip/components/merging.hpp @@ -302,4 +302,4 @@ __forceinline__ __device__ void sequential_match(const ValueType* a, } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_ diff --git a/common/cuda_hip/components/prefix_sum.hpp b/common/cuda_hip/components/prefix_sum.hpp index 8fc5bbe63b0..a09eb8f17c5 100644 --- a/common/cuda_hip/components/prefix_sum.hpp +++ b/common/cuda_hip/components/prefix_sum.hpp @@ -182,4 +182,4 @@ __global__ __launch_bounds__(block_size) void finalize_prefix_sum( } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_ diff --git a/common/cuda_hip/components/prefix_sum_kernels.cpp b/common/cuda_hip/components/prefix_sum_kernels.cpp index 40cb1bc48fc..ebf102a7181 100644 --- a/common/cuda_hip/components/prefix_sum_kernels.cpp +++ b/common/cuda_hip/components/prefix_sum_kernels.cpp @@ -80,4 +80,4 @@ template void prefix_sum_nonnegative( } // namespace components } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/components/reduction.hpp b/common/cuda_hip/components/reduction.hpp index d2889bb9c7e..582de3de1fb 100644 --- a/common/cuda_hip/components/reduction.hpp +++ b/common/cuda_hip/components/reduction.hpp @@ -296,4 +296,4 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_ diff --git a/common/cuda_hip/components/searching.hpp 
b/common/cuda_hip/components/searching.hpp index 599e7a8581c..61efde54197 100644 --- a/common/cuda_hip/components/searching.hpp +++ b/common/cuda_hip/components/searching.hpp @@ -228,4 +228,4 @@ __forceinline__ __device__ IndexType group_ary_search(IndexType offset, } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_ diff --git a/common/cuda_hip/components/segment_scan.hpp b/common/cuda_hip/components/segment_scan.hpp index d2f992850ef..af3953a4176 100644 --- a/common/cuda_hip/components/segment_scan.hpp +++ b/common/cuda_hip/components/segment_scan.hpp @@ -52,4 +52,4 @@ __device__ __forceinline__ bool segment_scan( } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_ diff --git a/common/cuda_hip/components/sorting.hpp b/common/cuda_hip/components/sorting.hpp index ecc9c5289f9..b3ce253b451 100644 --- a/common/cuda_hip/components/sorting.hpp +++ b/common/cuda_hip/components/sorting.hpp @@ -311,4 +311,4 @@ __forceinline__ __device__ void bitonic_sort(ValueType* local_elements, } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_ diff --git a/common/cuda_hip/components/syncfree.hpp b/common/cuda_hip/components/syncfree.hpp index 3c82c916a21..e1693fe4e4d 100644 --- a/common/cuda_hip/components/syncfree.hpp +++ b/common/cuda_hip/components/syncfree.hpp @@ -135,4 +135,4 @@ class syncfree_scheduler { } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_ diff --git a/common/cuda_hip/components/thread_ids.hpp b/common/cuda_hip/components/thread_ids.hpp index 4fef650f51c..7d7c5e2bda3 100644 --- 
a/common/cuda_hip/components/thread_ids.hpp +++ b/common/cuda_hip/components/thread_ids.hpp @@ -263,4 +263,4 @@ __device__ __forceinline__ IndexType get_subwarp_num_flat() } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_ diff --git a/common/cuda_hip/components/uninitialized_array.hpp b/common/cuda_hip/components/uninitialized_array.hpp index 215c7f5751a..d4a2b5939af 100644 --- a/common/cuda_hip/components/uninitialized_array.hpp +++ b/common/cuda_hip/components/uninitialized_array.hpp @@ -82,4 +82,4 @@ class uninitialized_array { } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_ diff --git a/common/cuda_hip/components/warp_blas.hpp b/common/cuda_hip/components/warp_blas.hpp index 1f25bb61634..cfa46b8a045 100644 --- a/common/cuda_hip/components/warp_blas.hpp +++ b/common/cuda_hip/components/warp_blas.hpp @@ -434,4 +434,4 @@ __device__ __forceinline__ remove_complex compute_infinity_norm( } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_ \ No newline at end of file +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_ diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp index 6b5f997d153..88988febbb0 100644 --- a/common/cuda_hip/distributed/matrix_kernels.cpp +++ b/common/cuda_hip/distributed/matrix_kernels.cpp @@ -201,4 +201,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( } // namespace distributed_matrix } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/distributed/partition_helpers_kernels.cpp b/common/cuda_hip/distributed/partition_helpers_kernels.cpp index cd1419230d2..e733c9217b1 100644 --- 
a/common/cuda_hip/distributed/partition_helpers_kernels.cpp +++ b/common/cuda_hip/distributed/partition_helpers_kernels.cpp @@ -45,4 +45,4 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( } // namespace partition_helpers } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/distributed/partition_kernels.cpp b/common/cuda_hip/distributed/partition_kernels.cpp index b4e051b97f5..7f623b423fb 100644 --- a/common/cuda_hip/distributed/partition_kernels.cpp +++ b/common/cuda_hip/distributed/partition_kernels.cpp @@ -135,4 +135,4 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( } // namespace partition } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/distributed/vector_kernels.cpp b/common/cuda_hip/distributed/vector_kernels.cpp index 91bd838497d..1bacc93489a 100644 --- a/common/cuda_hip/distributed/vector_kernels.cpp +++ b/common/cuda_hip/distributed/vector_kernels.cpp @@ -90,4 +90,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( } // namespace distributed_vector } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp index 6e6be7b81fd..e5f2bf5e5e5 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -435,4 +435,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace cholesky } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/factorization/factorization_kernels.cpp b/common/cuda_hip/factorization/factorization_kernels.cpp index da2666feb25..3a38175ab70 100644 --- 
a/common/cuda_hip/factorization/factorization_kernels.cpp +++ b/common/cuda_hip/factorization/factorization_kernels.cpp @@ -555,4 +555,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace factorization } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/factorization/lu_kernels.cpp b/common/cuda_hip/factorization/lu_kernels.cpp index 71d09e93ef7..aa432bf711c 100644 --- a/common/cuda_hip/factorization/lu_kernels.cpp +++ b/common/cuda_hip/factorization/lu_kernels.cpp @@ -341,4 +341,4 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( } // namespace lu_factorization } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/factorization/par_ic_kernels.cpp b/common/cuda_hip/factorization/par_ic_kernels.cpp index 7102d782b94..ee8b7c97f64 100644 --- a/common/cuda_hip/factorization/par_ic_kernels.cpp +++ b/common/cuda_hip/factorization/par_ic_kernels.cpp @@ -142,4 +142,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace par_ic_factorization } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/factorization/par_ilu_kernels.cpp b/common/cuda_hip/factorization/par_ilu_kernels.cpp index 447fdb99c2c..8bf71c471a8 100644 --- a/common/cuda_hip/factorization/par_ilu_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilu_kernels.cpp @@ -115,4 +115,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace par_ilu_factorization } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/matrix/coo_kernels.cpp b/common/cuda_hip/matrix/coo_kernels.cpp index 00ab983bc9f..cffe18d981b 100644 --- a/common/cuda_hip/matrix/coo_kernels.cpp +++ 
b/common/cuda_hip/matrix/coo_kernels.cpp @@ -345,4 +345,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace coo } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/matrix/dense_kernels.cpp b/common/cuda_hip/matrix/dense_kernels.cpp index b44c0396823..d8391ace023 100644 --- a/common/cuda_hip/matrix/dense_kernels.cpp +++ b/common/cuda_hip/matrix/dense_kernels.cpp @@ -843,4 +843,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); } // namespace dense } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/matrix/diagonal_kernels.cpp b/common/cuda_hip/matrix/diagonal_kernels.cpp index a824abc6f7c..e12d3ed4f9f 100644 --- a/common/cuda_hip/matrix/diagonal_kernels.cpp +++ b/common/cuda_hip/matrix/diagonal_kernels.cpp @@ -88,4 +88,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace diagonal } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/matrix/ell_kernels.cpp b/common/cuda_hip/matrix/ell_kernels.cpp index 40f174a25c7..bfdd3f21e51 100644 --- a/common/cuda_hip/matrix/ell_kernels.cpp +++ b/common/cuda_hip/matrix/ell_kernels.cpp @@ -395,4 +395,4 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( } // namespace ell } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp index f6276fdd056..960708378e1 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp +++ b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp @@ -635,4 +635,4 @@ void conj_transpose(std::shared_ptr exec, } // namespace fbcsr } // namespace 
GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/matrix/sellp_kernels.cpp b/common/cuda_hip/matrix/sellp_kernels.cpp index 64c672b8d8d..3e8fba395b3 100644 --- a/common/cuda_hip/matrix/sellp_kernels.cpp +++ b/common/cuda_hip/matrix/sellp_kernels.cpp @@ -138,4 +138,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace sellp } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.cpp b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp index 067b2749097..269708e19ae 100644 --- a/common/cuda_hip/matrix/sparsity_csr_kernels.cpp +++ b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp @@ -327,4 +327,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace sparsity_csr } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/multigrid/pgm_kernels.cpp b/common/cuda_hip/multigrid/pgm_kernels.cpp index a2c5d608a50..d3c44cf540e 100644 --- a/common/cuda_hip/multigrid/pgm_kernels.cpp +++ b/common/cuda_hip/multigrid/pgm_kernels.cpp @@ -85,4 +85,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace pgm } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/preconditioner/isai_kernels.cpp b/common/cuda_hip/preconditioner/isai_kernels.cpp index eda1f9a0661..d6fdd6389fc 100644 --- a/common/cuda_hip/preconditioner/isai_kernels.cpp +++ b/common/cuda_hip/preconditioner/isai_kernels.cpp @@ -600,4 +600,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace isai } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git 
a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.cpp index 3c581546be2..f614070f65e 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp +++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp @@ -412,4 +412,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace jacobi } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/reorder/rcm_kernels.cpp b/common/cuda_hip/reorder/rcm_kernels.cpp index 380ef69fac8..3206fb28c8b 100644 --- a/common/cuda_hip/reorder/rcm_kernels.cpp +++ b/common/cuda_hip/reorder/rcm_kernels.cpp @@ -658,4 +658,4 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_RCM_COMPUTE_PERMUTATION_KERNEL); } // namespace rcm } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/solver/cb_gmres_kernels.cpp b/common/cuda_hip/solver/cb_gmres_kernels.cpp index 59c9812dc65..02d45a8d31e 100644 --- a/common/cuda_hip/solver/cb_gmres_kernels.cpp +++ b/common/cuda_hip/solver/cb_gmres_kernels.cpp @@ -1049,4 +1049,4 @@ GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE( } // namespace cb_gmres } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/solver/idr_kernels.cpp b/common/cuda_hip/solver/idr_kernels.cpp index 63c5f015f68..a0f605134eb 100644 --- a/common/cuda_hip/solver/idr_kernels.cpp +++ b/common/cuda_hip/solver/idr_kernels.cpp @@ -650,4 +650,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); } // namespace idr } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/common/cuda_hip/solver/multigrid_kernels.cpp b/common/cuda_hip/solver/multigrid_kernels.cpp index 61b6ee44836..b9e411bd5f8 
100644 --- a/common/cuda_hip/solver/multigrid_kernels.cpp +++ b/common/cuda_hip/solver/multigrid_kernels.cpp @@ -204,4 +204,4 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( } // namespace multigrid } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko From aa0f23f37f79854a66bab23cae7a95fbd0bc3a95 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 29 Jun 2024 20:06:24 +0200 Subject: [PATCH 046/448] unify index_map --- common/cuda_hip/CMakeLists.txt | 1 + ..._kernels.hpp.inc => index_map_kernels.cpp} | 34 ++++++++++++++++ cuda/CMakeLists.txt | 1 - cuda/distributed/index_map_kernels.cu | 39 ------------------- hip/CMakeLists.txt | 1 - hip/distributed/index_map_kernels.hip.cpp | 39 ------------------- 6 files changed, 35 insertions(+), 80 deletions(-) rename common/cuda_hip/distributed/{index_map_kernels.hpp.inc => index_map_kernels.cpp} (92%) delete mode 100644 cuda/distributed/index_map_kernels.cu delete mode 100644 hip/distributed/index_map_kernels.hip.cpp diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 0225e3ad872..af6a8c24503 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -2,6 +2,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) set(CUDA_HIP_SOURCES base/device_matrix_data_kernels.cpp components/prefix_sum_kernels.cpp + distributed/index_map_kernels.cpp distributed/matrix_kernels.cpp distributed/partition_helpers_kernels.cpp distributed/partition_kernels.cpp diff --git a/common/cuda_hip/distributed/index_map_kernels.hpp.inc b/common/cuda_hip/distributed/index_map_kernels.cpp similarity index 92% rename from common/cuda_hip/distributed/index_map_kernels.hpp.inc rename to common/cuda_hip/distributed/index_map_kernels.cpp index 9d312cc43aa..744d0f5581f 100644 --- a/common/cuda_hip/distributed/index_map_kernels.hpp.inc +++ b/common/cuda_hip/distributed/index_map_kernels.cpp @@ -2,6 +2,34 @@ // 
// SPDX-License-Identifier: BSD-3-Clause +#include "core/distributed/index_map_kernels.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/searching.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace index_map { + + /** * This struct is necessary, since the `transform_output_iterator` seemingly * doesn't support non-copyable tranfsorm function (this excludes lambdas) @@ -266,3 +294,9 @@ void map_to_local( GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL); + + +} // namespace index_map +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 1552f4f3ee5..a068eb727b7 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -18,7 +18,6 @@ target_sources(ginkgo_cuda base/stream.cpp base/timer.cpp base/version.cpp - distributed/index_map_kernels.cu factorization/ic_kernels.cu factorization/ilu_kernels.cu factorization/par_ict_kernels.cu diff --git a/cuda/distributed/index_map_kernels.cu b/cuda/distributed/index_map_kernels.cu deleted file mode 100644 index 3c23d098a0e..00000000000 --- a/cuda/distributed/index_map_kernels.cu +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/index_map_kernels.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/atomic.hpp" -#include "common/cuda_hip/components/searching.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace 
index_map { - - -#include "common/cuda_hip/distributed/index_map_kernels.hpp.inc" - - -} // namespace index_map -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 71d41ad47df..d83e5e28d21 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -16,7 +16,6 @@ set(GINKGO_HIP_SOURCES base/stream.hip.cpp base/timer.hip.cpp base/version.hip.cpp - distributed/index_map_kernels.hip.cpp factorization/ic_kernels.hip.cpp factorization/ilu_kernels.hip.cpp factorization/par_ict_kernels.hip.cpp diff --git a/hip/distributed/index_map_kernels.hip.cpp b/hip/distributed/index_map_kernels.hip.cpp deleted file mode 100644 index 67ff2f72857..00000000000 --- a/hip/distributed/index_map_kernels.hip.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/index_map_kernels.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/atomic.hpp" -#include "common/cuda_hip/components/searching.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace index_map { - - -#include "common/cuda_hip/distributed/index_map_kernels.hpp.inc" - - -} // namespace index_map -} // namespace hip -} // namespace kernels -} // namespace gko From 6e7c0964ffe003da0f72a347f90348cde7af786e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 29 Jun 2024 20:12:13 +0200 Subject: [PATCH 047/448] unify stopping criteria --- common/cuda_hip/CMakeLists.txt | 2 + .../cuda_hip/stop/criterion_kernels.cpp | 7 +- .../cuda_hip/stop/residual_norm_kernels.cpp | 8 +- cuda/CMakeLists.txt | 2 - cuda/stop/residual_norm_kernels.cu | 179 ------------------ hip/CMakeLists.txt | 2 - hip/stop/criterion_kernels.hip.cpp | 58 ------ 7 files changed, 10 insertions(+), 248 
deletions(-) rename cuda/stop/criterion_kernels.cu => common/cuda_hip/stop/criterion_kernels.cpp (89%) rename hip/stop/residual_norm_kernels.hip.cpp => common/cuda_hip/stop/residual_norm_kernels.cpp (96%) delete mode 100644 cuda/stop/residual_norm_kernels.cu delete mode 100644 hip/stop/criterion_kernels.hip.cpp diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index af6a8c24503..c18755ab164 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -25,6 +25,8 @@ set(CUDA_HIP_SOURCES solver/cb_gmres_kernels.cpp solver/idr_kernels.cpp solver/multigrid_kernels.cpp + stop/criterion_kernels.cpp + stop/residual_norm_kernels.cpp ) list(TRANSFORM CUDA_HIP_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/) set(GKO_CUDA_HIP_COMMON_SOURCES ${CUDA_HIP_SOURCES} PARENT_SCOPE) diff --git a/cuda/stop/criterion_kernels.cu b/common/cuda_hip/stop/criterion_kernels.cpp similarity index 89% rename from cuda/stop/criterion_kernels.cu rename to common/cuda_hip/stop/criterion_kernels.cpp index fa596f0c03f..8e3a69f725e 100644 --- a/cuda/stop/criterion_kernels.cu +++ b/common/cuda_hip/stop/criterion_kernels.cpp @@ -9,13 +9,14 @@ #include #include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/thread_ids.hpp" namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The Set all statuses namespace. 
* @ref set_status @@ -38,7 +39,7 @@ __global__ __launch_bounds__(default_block_size) void set_all_statuses( } -void set_all_statuses(std::shared_ptr exec, +void set_all_statuses(std::shared_ptr exec, uint8 stoppingId, bool setFinalized, array* stop_status) { @@ -54,6 +55,6 @@ void set_all_statuses(std::shared_ptr exec, } // namespace set_all_statuses -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/common/cuda_hip/stop/residual_norm_kernels.cpp similarity index 96% rename from hip/stop/residual_norm_kernels.hip.cpp rename to common/cuda_hip/stop/residual_norm_kernels.cpp index 0a9af423128..9d6db5211e8 100644 --- a/hip/stop/residual_norm_kernels.hip.cpp +++ b/common/cuda_hip/stop/residual_norm_kernels.cpp @@ -17,7 +17,7 @@ namespace gko { namespace kernels { -namespace hip { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The Residual norm stopping criterion namespace. * @ref resnorm @@ -61,7 +61,7 @@ __global__ __launch_bounds__(1) void init_kernel( template -void residual_norm(std::shared_ptr exec, +void residual_norm(std::shared_ptr exec, const matrix::Dense* tau, const matrix::Dense* orig_tau, ValueType rel_residual_goal, uint8 stoppingId, @@ -143,7 +143,7 @@ __global__ __launch_bounds__(1) void init_kernel( template void implicit_residual_norm( - std::shared_ptr exec, + std::shared_ptr exec, const matrix::Dense* tau, const matrix::Dense>* orig_tau, remove_complex rel_residual_goal, uint8 stoppingId, @@ -175,6 +175,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL); } // namespace implicit_residual_norm -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index a068eb727b7..b44fe665153 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -41,8 +41,6 @@ target_sources(ginkgo_cuda solver/batch_cg_kernels.cu 
solver/lower_trs_kernels.cu solver/upper_trs_kernels.cu - stop/criterion_kernels.cu - stop/residual_norm_kernels.cu ${GKO_UNIFIED_COMMON_SOURCES} ${GKO_CUDA_HIP_COMMON_SOURCES} ) diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu deleted file mode 100644 index e52a74cf422..00000000000 --- a/cuda/stop/residual_norm_kernels.cu +++ /dev/null @@ -1,179 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/stop/residual_norm_kernels.hpp" - -#include -#include -#include - -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/base/array_access.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Residual norm stopping criterion namespace. - * @ref resnorm - * @ingroup resnorm - */ -namespace residual_norm { - - -constexpr int default_block_size = 512; - - -template -__global__ __launch_bounds__(default_block_size) void residual_norm_kernel( - size_type num_cols, ValueType rel_residual_goal, - const ValueType* __restrict__ tau, const ValueType* __restrict__ orig_tau, - uint8 stoppingId, bool setFinalized, - stopping_status* __restrict__ stop_status, - bool* __restrict__ device_storage) -{ - const auto tidx = thread::get_thread_id_flat(); - if (tidx < num_cols) { - if (tau[tidx] <= rel_residual_goal * orig_tau[tidx]) { - stop_status[tidx].converge(stoppingId, setFinalized); - device_storage[1] = true; - } - // because only false is written to all_converged, write conflicts - // should not cause any problem - else if (!stop_status[tidx].has_stopped()) { - device_storage[0] = false; - } - } -} - - -__global__ __launch_bounds__(1) void init_kernel( - bool* __restrict__ device_storage) -{ - device_storage[0] = true; - device_storage[1] = false; -} - - -template -void residual_norm(std::shared_ptr exec, - const matrix::Dense* tau, - const 
matrix::Dense* orig_tau, - ValueType rel_residual_goal, uint8 stoppingId, - bool setFinalized, array* stop_status, - array* device_storage, bool* all_converged, - bool* one_changed) -{ - static_assert(is_complex_s::value == false, - "ValueType must not be complex in this function!"); - init_kernel<<<1, 1, 0, exec->get_stream()>>>( - as_device_type(device_storage->get_data())); - - const auto block_size = default_block_size; - const auto grid_size = ceildiv(tau->get_size()[1], block_size); - - if (grid_size > 0) { - residual_norm_kernel<<get_stream()>>>( - tau->get_size()[1], as_device_type(rel_residual_goal), - as_device_type(tau->get_const_values()), - as_device_type(orig_tau->get_const_values()), stoppingId, - setFinalized, as_device_type(stop_status->get_data()), - as_device_type(device_storage->get_data())); - } - - /* Represents all_converged, one_changed */ - *all_converged = get_element(*device_storage, 0); - *one_changed = get_element(*device_storage, 1); -} - -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( - GKO_DECLARE_RESIDUAL_NORM_KERNEL); - - -} // namespace residual_norm - - -/** - * @brief The Implicit Residual norm stopping criterion. 
- * @ref implicit_resnorm - * @ingroup resnorm - */ -namespace implicit_residual_norm { - - -constexpr int default_block_size = 512; - - -template -__global__ -__launch_bounds__(default_block_size) void implicit_residual_norm_kernel( - size_type num_cols, remove_complex rel_residual_goal, - const ValueType* __restrict__ tau, - const remove_complex* __restrict__ orig_tau, uint8 stoppingId, - bool setFinalized, stopping_status* __restrict__ stop_status, - bool* __restrict__ device_storage) -{ - const auto tidx = thread::get_thread_id_flat(); - if (tidx < num_cols) { - if (sqrt(abs(tau[tidx])) <= rel_residual_goal * orig_tau[tidx]) { - stop_status[tidx].converge(stoppingId, setFinalized); - device_storage[1] = true; - } - // because only false is written to all_converged, write conflicts - // should not cause any problem - else if (!stop_status[tidx].has_stopped()) { - device_storage[0] = false; - } - } -} - - -__global__ __launch_bounds__(1) void init_kernel( - bool* __restrict__ device_storage) -{ - device_storage[0] = true; - device_storage[1] = false; -} - - -template -void implicit_residual_norm( - std::shared_ptr exec, - const matrix::Dense* tau, - const matrix::Dense>* orig_tau, - remove_complex rel_residual_goal, uint8 stoppingId, - bool setFinalized, array* stop_status, - array* device_storage, bool* all_converged, bool* one_changed) -{ - init_kernel<<<1, 1, 0, exec->get_stream()>>>( - as_device_type(device_storage->get_data())); - - const auto block_size = default_block_size; - const auto grid_size = ceildiv(tau->get_size()[1], block_size); - - if (grid_size > 0) { - implicit_residual_norm_kernel<<get_stream()>>>( - tau->get_size()[1], as_device_type(rel_residual_goal), - as_device_type(tau->get_const_values()), - as_device_type(orig_tau->get_const_values()), stoppingId, - setFinalized, as_device_type(stop_status->get_data()), - as_device_type(device_storage->get_data())); - } - - /* Represents all_converged, one_changed */ - *all_converged = 
get_element(*device_storage, 0); - *one_changed = get_element(*device_storage, 1); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IMPLICIT_RESIDUAL_NORM_KERNEL); - - -} // namespace implicit_residual_norm -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index d83e5e28d21..abc3d6b5bcf 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -38,8 +38,6 @@ set(GINKGO_HIP_SOURCES solver/batch_cg_kernels.hip.cpp solver/lower_trs_kernels.hip.cpp solver/upper_trs_kernels.hip.cpp - stop/criterion_kernels.hip.cpp - stop/residual_norm_kernels.hip.cpp ${GKO_UNIFIED_COMMON_SOURCES} ${GKO_CUDA_HIP_COMMON_SOURCES} ) diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp deleted file mode 100644 index 0b8e300f978..00000000000 --- a/hip/stop/criterion_kernels.hip.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/stop/criterion_kernels.hpp" - -#include -#include -#include - -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Set all statuses namespace. 
- * @ref set_status - * @ingroup set_all_statuses - */ -namespace set_all_statuses { - - -constexpr int default_block_size = 512; - - -__global__ __launch_bounds__(default_block_size) void set_all_statuses( - size_type num_elems, uint8 stoppingId, bool setFinalized, - stopping_status* stop_status) -{ - const auto tidx = thread::get_thread_id_flat(); - if (tidx < num_elems) { - stop_status[tidx].stop(stoppingId, setFinalized); - } -} - - -void set_all_statuses(std::shared_ptr exec, uint8 stoppingId, - bool setFinalized, array* stop_status) -{ - const auto block_size = default_block_size; - const auto grid_size = ceildiv(stop_status->get_size(), block_size); - - if (grid_size > 0) { - set_all_statuses<<get_stream()>>>( - stop_status->get_size(), stoppingId, setFinalized, - as_device_type(stop_status->get_data())); - } -} - - -} // namespace set_all_statuses -} // namespace hip -} // namespace kernels -} // namespace gko From b618a7e3e87e0e6d98269f9cb6aa7bc68ed407a1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 1 Jul 2024 23:44:06 +0200 Subject: [PATCH 048/448] fix include guard naming --- common/cuda_hip/base/math.hpp | 6 +++--- common/cuda_hip/components/atomic.hpp | 6 +++--- common/cuda_hip/components/diagonal_block_manipulation.hpp | 6 +++--- common/cuda_hip/components/intrinsics.hpp | 6 +++--- common/cuda_hip/components/merging.hpp | 6 +++--- common/cuda_hip/components/prefix_sum.hpp | 6 +++--- common/cuda_hip/components/reduction.hpp | 6 +++--- common/cuda_hip/components/searching.hpp | 6 +++--- common/cuda_hip/components/segment_scan.hpp | 6 +++--- common/cuda_hip/components/sorting.hpp | 6 +++--- common/cuda_hip/components/syncfree.hpp | 6 +++--- common/cuda_hip/components/thread_ids.hpp | 6 +++--- common/cuda_hip/components/uninitialized_array.hpp | 6 +++--- common/cuda_hip/components/warp_blas.hpp | 6 +++--- 14 files changed, 42 insertions(+), 42 deletions(-) diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index 
ea11c7d73a9..ee8612a691a 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_ +#ifndef GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_ #include @@ -54,4 +54,4 @@ struct truncate_type_impl> { } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_ diff --git a/common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp index 3279c9433f1..cb9e5b00e67 100644 --- a/common/cuda_hip/components/atomic.hpp +++ b/common/cuda_hip/components/atomic.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_ +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_ #include @@ -250,4 +250,4 @@ __forceinline__ __device__ thrust::complex atomic_add( } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_ diff --git a/common/cuda_hip/components/diagonal_block_manipulation.hpp b/common/cuda_hip/components/diagonal_block_manipulation.hpp index 890d080018e..e00e11f1eea 100644 --- a/common/cuda_hip/components/diagonal_block_manipulation.hpp +++ b/common/cuda_hip/components/diagonal_block_manipulation.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_ +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_ #include @@ -88,4 +88,4 @@ __device__ __forceinline__ void extract_transposed_diag_blocks( } // namespace gko 
-#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_ diff --git a/common/cuda_hip/components/intrinsics.hpp b/common/cuda_hip/components/intrinsics.hpp index e8c236e22b1..df3b5ad4c7f 100644 --- a/common/cuda_hip/components/intrinsics.hpp +++ b/common/cuda_hip/components/intrinsics.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_ +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_ #include @@ -55,4 +55,4 @@ __forceinline__ __device__ int clz(uint64 mask) { return __clzll(mask); } } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_ diff --git a/common/cuda_hip/components/merging.hpp b/common/cuda_hip/components/merging.hpp index 4c1bfa4cd2d..ab070741fbd 100644 --- a/common/cuda_hip/components/merging.hpp +++ b/common/cuda_hip/components/merging.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_ +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_ #include "common/cuda_hip/base/math.hpp" @@ -302,4 +302,4 @@ __forceinline__ __device__ void sequential_match(const ValueType* a, } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_ diff --git a/common/cuda_hip/components/prefix_sum.hpp b/common/cuda_hip/components/prefix_sum.hpp index a09eb8f17c5..ceed6b89a93 100644 --- a/common/cuda_hip/components/prefix_sum.hpp +++ b/common/cuda_hip/components/prefix_sum.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef 
GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_ +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_ #include @@ -182,4 +182,4 @@ __global__ __launch_bounds__(block_size) void finalize_prefix_sum( } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_ diff --git a/common/cuda_hip/components/reduction.hpp b/common/cuda_hip/components/reduction.hpp index 582de3de1fb..fd9d34ed73c 100644 --- a/common/cuda_hip/components/reduction.hpp +++ b/common/cuda_hip/components/reduction.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_ +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_ #include @@ -296,4 +296,4 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_ diff --git a/common/cuda_hip/components/searching.hpp b/common/cuda_hip/components/searching.hpp index 61efde54197..cb219c58b0b 100644 --- a/common/cuda_hip/components/searching.hpp +++ b/common/cuda_hip/components/searching.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_ +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_ #include "common/cuda_hip/base/config.hpp" @@ -228,4 +228,4 @@ __forceinline__ __device__ IndexType group_ary_search(IndexType offset, } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_ diff --git 
a/common/cuda_hip/components/segment_scan.hpp b/common/cuda_hip/components/segment_scan.hpp index af3953a4176..0ab34fd093b 100644 --- a/common/cuda_hip/components/segment_scan.hpp +++ b/common/cuda_hip/components/segment_scan.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_ +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_ #include "common/cuda_hip/components/cooperative_groups.hpp" @@ -52,4 +52,4 @@ __device__ __forceinline__ bool segment_scan( } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_ diff --git a/common/cuda_hip/components/sorting.hpp b/common/cuda_hip/components/sorting.hpp index b3ce253b451..7603d41a8ba 100644 --- a/common/cuda_hip/components/sorting.hpp +++ b/common/cuda_hip/components/sorting.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_ +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_ #include "common/cuda_hip/base/config.hpp" @@ -311,4 +311,4 @@ __forceinline__ __device__ void bitonic_sort(ValueType* local_elements, } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_ diff --git a/common/cuda_hip/components/syncfree.hpp b/common/cuda_hip/components/syncfree.hpp index e1693fe4e4d..f2fb82366a2 100644 --- a/common/cuda_hip/components/syncfree.hpp +++ b/common/cuda_hip/components/syncfree.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_ +#ifndef 
GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_ #include @@ -135,4 +135,4 @@ class syncfree_scheduler { } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_ diff --git a/common/cuda_hip/components/thread_ids.hpp b/common/cuda_hip/components/thread_ids.hpp index 7d7c5e2bda3..e73296f92a9 100644 --- a/common/cuda_hip/components/thread_ids.hpp +++ b/common/cuda_hip/components/thread_ids.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_ +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_ #include "common/cuda_hip/base/config.hpp" @@ -263,4 +263,4 @@ __device__ __forceinline__ IndexType get_subwarp_num_flat() } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_ diff --git a/common/cuda_hip/components/uninitialized_array.hpp b/common/cuda_hip/components/uninitialized_array.hpp index d4a2b5939af..44fcbfd0d85 100644 --- a/common/cuda_hip/components/uninitialized_array.hpp +++ b/common/cuda_hip/components/uninitialized_array.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_ +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ #include @@ -82,4 +82,4 @@ class uninitialized_array { } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ diff --git a/common/cuda_hip/components/warp_blas.hpp b/common/cuda_hip/components/warp_blas.hpp index 
cfa46b8a045..116b963ad11 100644 --- a/common/cuda_hip/components/warp_blas.hpp +++ b/common/cuda_hip/components/warp_blas.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_ -#define GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_ +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_ #include @@ -434,4 +434,4 @@ __device__ __forceinline__ remove_complex compute_infinity_norm( } // namespace gko -#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_INC_ +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_ From d16dd18d45d41a43c5b738455b377a1e9a0601ab Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 1 Jul 2024 23:44:33 +0200 Subject: [PATCH 049/448] fix formatting Co-authored-by: Yuhsiang M. Tsai --- common/cuda_hip/base/math.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index ee8612a691a..8c655174524 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -49,8 +49,6 @@ struct truncate_type_impl> { } // namespace detail - - } // namespace gko From 6762a902939db8ee853168dfb23e58ee4e70f4a7 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 2 Jul 2024 15:23:41 +0200 Subject: [PATCH 050/448] add unification script --- dev_tools/scripts/unify_cuda_hip.py | 135 ++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 dev_tools/scripts/unify_cuda_hip.py diff --git a/dev_tools/scripts/unify_cuda_hip.py b/dev_tools/scripts/unify_cuda_hip.py new file mode 100644 index 00000000000..e359a69d1ff --- /dev/null +++ b/dev_tools/scripts/unify_cuda_hip.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +import sys +import os +import difflib +import subprocess + +common_filename = sys.argv[1] +base_filename = common_filename.replace("common/cuda_hip/", "").replace(".hpp.inc", "") +cuda_filename = next( + 
f"cuda/{base_filename}{extension}" + for extension in [".cu", ".cuh", ".cpp", ".hpp", ".template.cu"] + if os.path.exists(f"cuda/{base_filename}{extension}") +) +hip_filename = next( + f"hip/{base_filename}{extension}" + for extension in [".hip.cpp", ".hip.hpp", ".template.hip.cpp"] + if os.path.exists(f"hip/{base_filename}{extension}") +) +output_filename = f"common/cuda_hip/{base_filename}{'.cpp' if cuda_filename.endswith('.cu') else '.hpp'}" + +common_lines = list(open(common_filename))[3:] # remove license header +cuda_lines = list(open(cuda_filename)) +hip_lines = list(open(hip_filename)) + +cuda_file_guard = f"GKO_{cuda_filename.upper().replace('/', '_').replace('.','_')}_" +hip_file_guard = f"GKO_{hip_filename.upper().replace('/', '_').replace('.','_')}_" +common_file_guard = f"GKO_{common_filename.upper().replace('/', '_').replace('.','_')}_" + +cuda_lines = [ + line.replace(cuda_file_guard, common_file_guard) + .replace("namespace cuda", "namespace GKO_DEVICE_NAMESPACE") + .replace("CudaExecutor", "DefaultExecutor") + for line in cuda_lines +] +hip_lines = [ + line.replace(hip_file_guard, common_file_guard) + .replace("namespace hip", "namespace GKO_DEVICE_NAMESPACE") + .replace("HipExecutor", "DefaultExecutor") + for line in hip_lines +] + +for i in range(len(cuda_lines)): + if cuda_lines[i].startswith('#include "'): + cuda_lines[i] = ( + cuda_lines[i] + .replace('#include "cuda/', '#include "common/cuda_hip/') + .replace(".cuh", ".hpp") + .replace("cublas", "blas") + .replace("cusparse", "sparselib") + .replace("curand", "randlib") + ) + cuda_lines[i] = ( + cuda_lines[i] + .replace("cuda_range", "device_range") + .replace("cuda::", "GKO_DEVICE_NAMESPACE::") + ) +for i in range(len(hip_lines)): + if hip_lines[i].startswith('#include "'): + hip_lines[i] = ( + hip_lines[i] + .replace('#include "hip/', '#include "common/cuda_hip/') + .replace(".hip.hpp", ".hpp") + .replace("hipblas", "blas") + .replace("hipsparse", "sparselib") + .replace("hiprand", 
"randlib") + ) + hip_lines[i] = ( + hip_lines[i] + .replace("hip_range", "device_range") + .replace("hip::", "GKO_DEVICE_NAMESPACE::") + ) + +cuda_location = next( + i + for i, line in enumerate(cuda_lines) + if line.startswith(f'#include "{common_filename}"') +) +hip_location = next( + i + for i, line in enumerate(hip_lines) + if line.startswith(f'#include "{common_filename}"') +) +cuda_replaced = ( + cuda_lines[:cuda_location] + common_lines + cuda_lines[cuda_location + 1 :] +) +hip_replaced = hip_lines[:hip_location] + common_lines + hip_lines[hip_location + 1 :] + +cuda_replaced = ( + subprocess.run( + args=[ + "/home/tribizel/.cache/pre-commit/repoay30okq9/py_env-python3/lib64/python3.9/site-packages/clang_format/data/bin/clang-format", + f"-assume-filename={output_filename}", + "-", + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + input=bytes("".join(cuda_replaced), "utf-8"), + ) + .stdout.decode() + .splitlines() +) +hip_replaced = ( + subprocess.run( + args=[ + "/home/tribizel/.cache/pre-commit/repoay30okq9/py_env-python3/lib64/python3.9/site-packages/clang_format/data/bin/clang-format", + f"-assume-filename={output_filename}", + "-", + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + input=bytes("".join(hip_replaced), "utf-8"), + ) + .stdout.decode() + .splitlines() +) + +if cuda_replaced == hip_replaced: + with open(output_filename, "w") as file: + file.write("\n".join(cuda_replaced)) + os.remove(common_filename) + os.remove(cuda_filename) + os.remove(hip_filename) + with open("cuda_source_delete.sed", "a") as file: + file.write("/" + cuda_filename[5:].replace("/", "\\/") + "/d;") + with open("hip_source_delete.sed", "a") as file: + file.write("/" + hip_filename[4:].replace("/", "\\/") + "/d;") + with open("source_add.cmake", "a") as file: + file.write(f"{output_filename}\n") + sys.exit(0) +else: + print(common_filename) + print(cuda_filename) + print(hip_filename) + print("\n".join(difflib.unified_diff(cuda_replaced, hip_replaced))) + 
sys.exit(1) From c8c7051c2e3c224532a7d87d76575340d8ca2bf7 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 4 Jul 2024 15:51:47 +0200 Subject: [PATCH 051/448] mirror cuda_hip in generated headers --- common/cuda_hip/preconditioner/jacobi_kernels.cpp | 3 ++- cuda/CMakeLists.txt | 2 +- cuda/preconditioner/batch_jacobi_kernels.cu | 3 ++- cuda/preconditioner/jacobi_advanced_apply_kernels.cu | 3 ++- .../jacobi_advanced_apply_kernels.instantiate.cu | 3 ++- cuda/preconditioner/jacobi_generate_kernels.cu | 3 ++- cuda/preconditioner/jacobi_generate_kernels.instantiate.cu | 3 ++- cuda/preconditioner/jacobi_simple_apply_kernels.cu | 3 ++- cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu | 3 ++- hip/CMakeLists.txt | 2 +- hip/preconditioner/batch_jacobi_kernels.hip.cpp | 3 ++- hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp | 3 ++- .../jacobi_advanced_apply_kernels.instantiate.hip.cpp | 3 ++- hip/preconditioner/jacobi_generate_kernels.hip.cpp | 3 ++- hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp | 3 ++- hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp | 3 ++- .../jacobi_simple_apply_kernels.instantiate.hip.cpp | 3 ++- 17 files changed, 32 insertions(+), 17 deletions(-) diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.cpp index f614070f65e..8cf5ad1e9fd 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp +++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp @@ -15,7 +15,8 @@ #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index b44fe665153..30b3f2747e6 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -74,7 +74,7 @@ foreach(GKO_JACOBI_BLOCK_SIZE IN 
LISTS GKO_CUDA_JACOBI_BLOCK_SIZES) endforeach() target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES}) string(REPLACE ";" "," GKO_CUDA_JACOBI_BLOCK_SIZES_CODE "${GKO_CUDA_JACOBI_BLOCK_SIZES}") -configure_file(preconditioner/jacobi_common.hpp.in preconditioner/jacobi_common.hpp) +configure_file(preconditioner/jacobi_common.hpp.in common/cuda_hip/preconditioner/jacobi_common.hpp) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") # remove false positive CUDA warnings when calling one() and zero() diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu index 178b53d04ea..716c158ffff 100644 --- a/cuda/preconditioner/batch_jacobi_kernels.cu +++ b/cuda/preconditioner/batch_jacobi_kernels.cu @@ -21,7 +21,8 @@ #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" #include "cuda/matrix/batch_struct.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu index fca6b24ba05..a37296abf40 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu @@ -7,7 +7,8 @@ #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu index 80c3b5e1e73..fcf238d038f 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu @@ -15,7 +15,8 @@ #include 
"core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/preconditioner/jacobi_generate_kernels.cu b/cuda/preconditioner/jacobi_generate_kernels.cu index e558594f5ce..d51f1947b7a 100644 --- a/cuda/preconditioner/jacobi_generate_kernels.cu +++ b/cuda/preconditioner/jacobi_generate_kernels.cu @@ -8,7 +8,8 @@ #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu index 0dc21311af9..aa8807728a8 100644 --- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu @@ -18,7 +18,8 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.cu index 0bb09b1064a..62e49d30618 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernels.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernels.cu @@ -7,7 +7,8 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include 
"common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu index 0721c03126b..d51b63487fe 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu @@ -15,7 +15,8 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index abc3d6b5bcf..23584c2742a 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -84,7 +84,7 @@ foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES) ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) endforeach() string(REPLACE ";" "," GKO_HIP_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}") -configure_file(preconditioner/jacobi_common.hip.hpp.in preconditioner/jacobi_common.hpp) +configure_file(preconditioner/jacobi_common.hip.hpp.in common/cuda_hip/preconditioner/jacobi_common.hpp) set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES LANGUAGE HIP) add_library(ginkgo_hip $ ${GINKGO_HIP_SOURCES}) diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp index cfef615dcad..e86bc86390a 100644 --- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp +++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp @@ -23,7 +23,8 @@ #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" 
namespace gko { diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp index ce260ec1e16..371a10051fc 100644 --- a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp @@ -7,7 +7,8 @@ #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp index 9cc4978a1f8..42c542c228b 100644 --- a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp @@ -16,7 +16,8 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_generate_kernels.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.hip.cpp index 673ca8c373e..d295ebb046e 100644 --- a/hip/preconditioner/jacobi_generate_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernels.hip.cpp @@ -19,7 +19,8 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp 
index a6be610a839..698efe6a858 100644 --- a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp @@ -18,7 +18,8 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp index 72f2e4fe556..16ca805a42c 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp @@ -16,7 +16,8 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp index 1ea34bff93f..d666a698b5e 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp @@ -15,7 +15,8 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "preconditioner/jacobi_common.hpp" +// generated header +#include "common/cuda_hip/preconditioner/jacobi_common.hpp" namespace gko { From 65aa14fb75c72042297f70b85935e7b009d5e6d8 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 4 Jul 2024 16:07:32 +0200 Subject: [PATCH 052/448] fix HIP warnings The % in the computation gets misinterpreted by the printf used as a fallback for 
GKO_ASSERT --- hip/solver/batch_bicgstab_kernels.hip.cpp | 7 +++++-- hip/solver/batch_cg_kernels.hip.cpp | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 92051a81640..95a49953b3e 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -125,8 +125,9 @@ class kernel_caller { exec_->get_device_id())); const int block_size = get_num_threads_per_block(exec_, mat.num_rows); + bool is_block_size_aligned = block_size % config::warp_size == 0; GKO_ASSERT(block_size >= 2 * config::warp_size); - GKO_ASSERT(block_size % config::warp_size == 0); + GKO_ASSERT(is_block_size_aligned); // Returns amount required in bytes const size_t prec_size = PrecType::dynamic_work_size( @@ -142,7 +143,9 @@ class kernel_caller { auto workspace = gko::array( exec_, sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0); + bool is_stride_aligned = + sconf.gmem_stride_bytes % sizeof(value_type) == 0; + GKO_ASSERT(is_stride_aligned); value_type* const workspace_data = workspace.get_data(); diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 2df02a6f0a8..6102749b988 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -125,8 +125,9 @@ class kernel_caller { exec_->get_device_id())); const int block_size = get_num_threads_per_block(exec_, mat.num_rows); + bool is_block_size_aligned = block_size % config::warp_size == 0; GKO_ASSERT(block_size >= 2 * config::warp_size); - GKO_ASSERT(block_size % config::warp_size == 0); + GKO_ASSERT(is_block_size_aligned); // Returns amount required in bytes const size_t prec_size = PrecType::dynamic_work_size( @@ -142,7 +143,9 @@ class kernel_caller { auto workspace = gko::array( exec_, sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - 
GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0); + bool is_stride_aligned = + sconf.gmem_stride_bytes % sizeof(value_type) == 0; + GKO_ASSERT(is_stride_aligned); value_type* const workspace_data = workspace.get_data(); From 674c54a3c83dca988974e37c42ac25ee25e7bede Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 4 Jul 2024 16:10:09 +0200 Subject: [PATCH 053/448] review updates Co-authored-by: Yu-Hsiang M. Tsai --- common/cuda_hip/components/atomic.hpp | 2 -- common/cuda_hip/components/reduction.hpp | 4 ++-- common/cuda_hip/distributed/index_map_kernels.cpp | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp index cb9e5b00e67..2fbb1664165 100644 --- a/common/cuda_hip/components/atomic.hpp +++ b/common/cuda_hip/components/atomic.hpp @@ -15,8 +15,6 @@ namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { - - namespace detail { diff --git a/common/cuda_hip/components/reduction.hpp b/common/cuda_hip/components/reduction.hpp index fd9d34ed73c..1968a6d30b6 100644 --- a/common/cuda_hip/components/reduction.hpp +++ b/common/cuda_hip/components/reduction.hpp @@ -260,8 +260,8 @@ __launch_bounds__(default_reduce_block_size) void reduce_add_array_with_initial_ * @return the reduction result */ template -__host__ ValueType reduce_add_array(std::shared_ptr exec, - size_type size, const ValueType* source) +ValueType reduce_add_array(std::shared_ptr exec, + size_type size, const ValueType* source) { auto block_results_val = source; size_type grid_dim = size; diff --git a/common/cuda_hip/distributed/index_map_kernels.cpp b/common/cuda_hip/distributed/index_map_kernels.cpp index 744d0f5581f..e27c5221013 100644 --- a/common/cuda_hip/distributed/index_map_kernels.cpp +++ b/common/cuda_hip/distributed/index_map_kernels.cpp @@ -299,4 +299,4 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( } // namespace index_map } // namespace GKO_DEVICE_NAMESPACE } // 
namespace kernels -} // namespace gko \ No newline at end of file +} // namespace gko From 53be0aa665401be322c76f0d6eb645315ce83ba3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 5 Jul 2024 09:00:15 +0200 Subject: [PATCH 054/448] Revert "add unification script" This reverts commit a0f5a82289392e2281b8fe2fc0713cf4043a480c. --- dev_tools/scripts/unify_cuda_hip.py | 135 ---------------------------- 1 file changed, 135 deletions(-) delete mode 100644 dev_tools/scripts/unify_cuda_hip.py diff --git a/dev_tools/scripts/unify_cuda_hip.py b/dev_tools/scripts/unify_cuda_hip.py deleted file mode 100644 index e359a69d1ff..00000000000 --- a/dev_tools/scripts/unify_cuda_hip.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -import difflib -import subprocess - -common_filename = sys.argv[1] -base_filename = common_filename.replace("common/cuda_hip/", "").replace(".hpp.inc", "") -cuda_filename = next( - f"cuda/{base_filename}{extension}" - for extension in [".cu", ".cuh", ".cpp", ".hpp", ".template.cu"] - if os.path.exists(f"cuda/{base_filename}{extension}") -) -hip_filename = next( - f"hip/{base_filename}{extension}" - for extension in [".hip.cpp", ".hip.hpp", ".template.hip.cpp"] - if os.path.exists(f"hip/{base_filename}{extension}") -) -output_filename = f"common/cuda_hip/{base_filename}{'.cpp' if cuda_filename.endswith('.cu') else '.hpp'}" - -common_lines = list(open(common_filename))[3:] # remove license header -cuda_lines = list(open(cuda_filename)) -hip_lines = list(open(hip_filename)) - -cuda_file_guard = f"GKO_{cuda_filename.upper().replace('/', '_').replace('.','_')}_" -hip_file_guard = f"GKO_{hip_filename.upper().replace('/', '_').replace('.','_')}_" -common_file_guard = f"GKO_{common_filename.upper().replace('/', '_').replace('.','_')}_" - -cuda_lines = [ - line.replace(cuda_file_guard, common_file_guard) - .replace("namespace cuda", "namespace GKO_DEVICE_NAMESPACE") - .replace("CudaExecutor", "DefaultExecutor") - for line in 
cuda_lines -] -hip_lines = [ - line.replace(hip_file_guard, common_file_guard) - .replace("namespace hip", "namespace GKO_DEVICE_NAMESPACE") - .replace("HipExecutor", "DefaultExecutor") - for line in hip_lines -] - -for i in range(len(cuda_lines)): - if cuda_lines[i].startswith('#include "'): - cuda_lines[i] = ( - cuda_lines[i] - .replace('#include "cuda/', '#include "common/cuda_hip/') - .replace(".cuh", ".hpp") - .replace("cublas", "blas") - .replace("cusparse", "sparselib") - .replace("curand", "randlib") - ) - cuda_lines[i] = ( - cuda_lines[i] - .replace("cuda_range", "device_range") - .replace("cuda::", "GKO_DEVICE_NAMESPACE::") - ) -for i in range(len(hip_lines)): - if hip_lines[i].startswith('#include "'): - hip_lines[i] = ( - hip_lines[i] - .replace('#include "hip/', '#include "common/cuda_hip/') - .replace(".hip.hpp", ".hpp") - .replace("hipblas", "blas") - .replace("hipsparse", "sparselib") - .replace("hiprand", "randlib") - ) - hip_lines[i] = ( - hip_lines[i] - .replace("hip_range", "device_range") - .replace("hip::", "GKO_DEVICE_NAMESPACE::") - ) - -cuda_location = next( - i - for i, line in enumerate(cuda_lines) - if line.startswith(f'#include "{common_filename}"') -) -hip_location = next( - i - for i, line in enumerate(hip_lines) - if line.startswith(f'#include "{common_filename}"') -) -cuda_replaced = ( - cuda_lines[:cuda_location] + common_lines + cuda_lines[cuda_location + 1 :] -) -hip_replaced = hip_lines[:hip_location] + common_lines + hip_lines[hip_location + 1 :] - -cuda_replaced = ( - subprocess.run( - args=[ - "/home/tribizel/.cache/pre-commit/repoay30okq9/py_env-python3/lib64/python3.9/site-packages/clang_format/data/bin/clang-format", - f"-assume-filename={output_filename}", - "-", - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - input=bytes("".join(cuda_replaced), "utf-8"), - ) - .stdout.decode() - .splitlines() -) -hip_replaced = ( - subprocess.run( - args=[ - 
"/home/tribizel/.cache/pre-commit/repoay30okq9/py_env-python3/lib64/python3.9/site-packages/clang_format/data/bin/clang-format", - f"-assume-filename={output_filename}", - "-", - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - input=bytes("".join(hip_replaced), "utf-8"), - ) - .stdout.decode() - .splitlines() -) - -if cuda_replaced == hip_replaced: - with open(output_filename, "w") as file: - file.write("\n".join(cuda_replaced)) - os.remove(common_filename) - os.remove(cuda_filename) - os.remove(hip_filename) - with open("cuda_source_delete.sed", "a") as file: - file.write("/" + cuda_filename[5:].replace("/", "\\/") + "/d;") - with open("hip_source_delete.sed", "a") as file: - file.write("/" + hip_filename[4:].replace("/", "\\/") + "/d;") - with open("source_add.cmake", "a") as file: - file.write(f"{output_filename}\n") - sys.exit(0) -else: - print(common_filename) - print(cuda_filename) - print(hip_filename) - print("\n".join(difflib.unified_diff(cuda_replaced, hip_replaced))) - sys.exit(1) From 9e7a334cce2f6042a06c00f5b5a34bce22d65e26 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 31 May 2024 15:01:22 +0200 Subject: [PATCH 055/448] add debug logger for solvers --- core/CMakeLists.txt | 1 + core/log/solver_debug.cpp | 126 +++++++++++++++++++++++ core/test/log/CMakeLists.txt | 1 + core/test/log/solver_debug.cpp | 77 ++++++++++++++ include/ginkgo/core/log/solver_debug.hpp | 80 ++++++++++++++ include/ginkgo/ginkgo.hpp | 1 + 6 files changed, 286 insertions(+) create mode 100644 core/log/solver_debug.cpp create mode 100644 core/test/log/solver_debug.cpp create mode 100644 include/ginkgo/core/log/solver_debug.hpp diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 14ae6ce6592..56d35e8edf0 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -62,6 +62,7 @@ target_sources(${ginkgo_core} log/tau.cpp log/vtune.cpp log/record.cpp + log/solver_debug.cpp log/stream.cpp matrix/batch_csr.cpp matrix/batch_dense.cpp diff --git 
a/core/log/solver_debug.cpp b/core/log/solver_debug.cpp new file mode 100644 index 00000000000..be945233a61 --- /dev/null +++ b/core/log/solver_debug.cpp @@ -0,0 +1,126 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + + +#include + + +#include +#include +#include +#include + + +namespace gko { +namespace log { + + +static void print_scalar(const LinOp* value, std::ostream& stream) +{ + using conv_to_double = ConvertibleTo>; + using conv_to_complex = ConvertibleTo>>; + const auto host_exec = value->get_executor()->get_master(); + if (value->get_size()[0] == 0) { + stream << ""; + } else if (value->get_size()[0] != 1) { + stream << ""; + } else if (dynamic_cast(value)) { + auto host_value = matrix::Dense::create(host_exec); + host_value->copy_from(value); + stream << host_value->at(0, 0); + } else if (dynamic_cast(value)) { + auto host_value = + matrix::Dense>::create(host_exec); + host_value->copy_from(value); + stream << host_value->at(0, 0); + } else { + stream << ""; + } +} + + +void SolverDebug::on_linop_apply_started(const LinOp* solver, const LinOp* in, + const LinOp* out) const +{ + using solver_base = solver::detail::SolverBaseLinOp; + auto dynamic_type = name_demangling::get_dynamic_type(*solver); + auto& stream = *output_; + stream << dynamic_type << "::apply(" << in << ',' << out + << ") of dimensions " << solver->get_size() << " and " + << in->get_size()[1] << " rhs\n"; + if (const auto base = dynamic_cast(solver)) { + const auto scalars = base->get_workspace_scalars(); + const auto names = base->get_workspace_op_names(); + stream << std::setw(column_width_) << "Iteration"; + for (auto scalar : scalars) { + stream << std::setw(column_width_) << names[scalar]; + } + stream << '\n'; + } else { + stream << "This solver type is not supported by the SolverDebug logger"; + } +} + + +void SolverDebug::on_iteration_complete( + const LinOp* solver, const LinOp* right_hand_side, const LinOp* 
solution, + const size_type& num_iterations, const LinOp* residual, + const LinOp* residual_norm, const LinOp* implicit_sq_residual_norm, + const array* status, bool stopped) const +{ + using solver_base = solver::detail::SolverBaseLinOp; + auto& stream = *output_; + stream << std::setprecision(precision_); + if (const auto base = dynamic_cast(solver)) { + const auto scalars = base->get_workspace_scalars(); + stream << std::setw(column_width_) << num_iterations; + for (auto scalar : scalars) { + stream << std::setw(column_width_); + print_scalar(base->get_workspace_op(scalar), stream); + } + stream << '\n'; + } +} + + +void SolverDebug::on_iteration_complete(const LinOp* solver, + const size_type& num_iterations, + const LinOp* residual, + const LinOp* solution, + const LinOp* residual_norm) const +{ + on_iteration_complete(solver, nullptr, solution, num_iterations, residual, + residual_norm, nullptr, nullptr, false); +} + + +void SolverDebug::on_iteration_complete( + const LinOp* solver, const size_type& num_iterations, const LinOp* residual, + const LinOp* solution, const LinOp* residual_norm, + const LinOp* implicit_sq_residual_norm) const +{ + on_iteration_complete(solver, nullptr, solution, num_iterations, residual, + residual_norm, implicit_sq_residual_norm, nullptr, + false); +} + + +SolverDebug::SolverDebug(std::ostream& stream, int precision, int column_width) + : output_{&stream}, precision_{precision}, column_width_{column_width} +{} + + +std::shared_ptr SolverDebug::create(std::ostream& output, + int precision, + int column_width) +{ + return std::shared_ptr{ + new SolverDebug{output, precision, column_width}}; +} + + +} // namespace log +} // namespace gko diff --git a/core/test/log/CMakeLists.txt b/core/test/log/CMakeLists.txt index 8efd7fafc46..1231b996f5a 100644 --- a/core/test/log/CMakeLists.txt +++ b/core/test/log/CMakeLists.txt @@ -6,4 +6,5 @@ endif() ginkgo_create_test(performance_hint) ginkgo_create_test(profiler_hook) 
ginkgo_create_test(record) +ginkgo_create_test(solver_debug) ginkgo_create_test(stream) diff --git a/core/test/log/solver_debug.cpp b/core/test/log/solver_debug.cpp new file mode 100644 index 00000000000..2b0ec771590 --- /dev/null +++ b/core/test/log/solver_debug.cpp @@ -0,0 +1,77 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + + +#include + + +#include +#include +#include + + +#include "core/test/utils.hpp" + + +template +class SolverDebug : public ::testing::Test { +public: + using Dense = gko::matrix::Dense; + using Cg = gko::solver::Cg; + + SolverDebug() : ref{gko::ReferenceExecutor::create()} + { + mtx = gko::initialize({T{1.0}}, ref); + in = gko::initialize({T{2.0}}, ref); + out = mtx->clone(); + solver = + Cg::build() + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) + .on(ref) + ->generate(mtx); + } + + std::shared_ptr ref; + std::shared_ptr mtx; + std::shared_ptr in; + std::unique_ptr out; + std::unique_ptr solver; +}; + +TYPED_TEST_SUITE(SolverDebug, gko::test::ValueTypes, TypenameNameGenerator); + + +TYPED_TEST(SolverDebug, Works) +{ + using T = TypeParam; + std::stringstream ref_ss; + int default_column_width = 12; + auto dynamic_type = gko::name_demangling::get_dynamic_type(*this->solver); + ref_ss << dynamic_type << "::apply(" << this->in.get() << ',' + << this->out.get() << ") of dimensions " << this->solver->get_size() + << " and " << this->in->get_size()[1] << " rhs\n"; + ref_ss << std::setw(default_column_width) << "Iteration" + << std::setw(default_column_width) << "alpha" + << std::setw(default_column_width) << "beta" + << std::setw(default_column_width) << "prev_rho" + << std::setw(default_column_width) << "rho" << '\n'; + ref_ss << std::setw(default_column_width) << 0 + << std::setw(default_column_width) << T{0.0} + << std::setw(default_column_width) << T{0.0} + << std::setw(default_column_width) << T{1.0} + << std::setw(default_column_width) << T{1.0} 
<< '\n' + << std::setw(default_column_width) << 1 + << std::setw(default_column_width) << T{0.0} + << std::setw(default_column_width) << T{1.0} + << std::setw(default_column_width) << T{0.0} + << std::setw(default_column_width) << T{1.0} << '\n'; + std::stringstream ss; + this->solver->add_logger(gko::log::SolverDebug::create(ss)); + + this->solver->apply(this->in, this->out); + + ASSERT_EQ(ss.str(), ref_ss.str()); +} diff --git a/include/ginkgo/core/log/solver_debug.hpp b/include/ginkgo/core/log/solver_debug.hpp new file mode 100644 index 00000000000..873a7a247cf --- /dev/null +++ b/include/ginkgo/core/log/solver_debug.hpp @@ -0,0 +1,80 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_PUBLIC_CORE_LOG_SOLVER_DEBUG_HPP_ +#define GKO_PUBLIC_CORE_LOG_SOLVER_DEBUG_HPP_ + + +#include + + +#include +#include + + +namespace gko { +namespace log { + + +/** + * This Logger prints the value of all scalar values stored internally by the + * solver after each iteration. If the solver is applied to multiple right-hand + * sides, only the first right-hand side gets printed. 
+ */ +class SolverDebug : public Logger { +public: + /* Internal solver events */ + void on_linop_apply_started(const LinOp* A, const LinOp* b, + const LinOp* x) const override; + + void on_iteration_complete( + const LinOp* solver, const LinOp* right_hand_side, + const LinOp* solution, const size_type& num_iterations, + const LinOp* residual, const LinOp* residual_norm, + const LinOp* implicit_sq_residual_norm, + const array* status, bool stopped) const override; + + GKO_DEPRECATED( + "Please use the version with the additional stopping " + "information.") + void on_iteration_complete(const LinOp* solver, + const size_type& num_iterations, + const LinOp* residual, const LinOp* solution, + const LinOp* residual_norm) const override; + + GKO_DEPRECATED( + "Please use the version with the additional stopping " + "information.") + void on_iteration_complete( + const LinOp* solver, const size_type& num_iterations, + const LinOp* residual, const LinOp* solution, + const LinOp* residual_norm, + const LinOp* implicit_sq_residual_norm) const override; + + /** + * Creates a logger printing the value for all scalar values in the solver + * after each iteration. + * + * @param output the stream to write the output to. 
+ * @param precision the number of digits of precision to print + * @param column_width the number of characters an output column is wide + */ + static std::shared_ptr create(std::ostream& output, + int precision = 6, + int column_width = 12); + +private: + SolverDebug(std::ostream& output, int precision, int column_width); + + std::ostream* output_; + int precision_; + int column_width_; +}; + + +} // namespace log +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_LOG_SOLVER_DEBUG_HPP_ diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index 503b0143e09..2e307792c85 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -87,6 +87,7 @@ #include #include #include +#include #include #include From fef293b2b80c008c80efebdf218a38db02559c35 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 5 Jun 2024 10:09:14 +0200 Subject: [PATCH 056/448] add missing includes --- core/log/solver_debug.cpp | 1 + include/ginkgo/core/log/solver_debug.hpp | 1 + 2 files changed, 2 insertions(+) diff --git a/core/log/solver_debug.cpp b/core/log/solver_debug.cpp index be945233a61..760f182bde1 100644 --- a/core/log/solver_debug.cpp +++ b/core/log/solver_debug.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include diff --git a/include/ginkgo/core/log/solver_debug.hpp b/include/ginkgo/core/log/solver_debug.hpp index 873a7a247cf..9b589f29b88 100644 --- a/include/ginkgo/core/log/solver_debug.hpp +++ b/include/ginkgo/core/log/solver_debug.hpp @@ -7,6 +7,7 @@ #include +#include #include From 2e222abfedf0c88d58aba2f96423b82c188102c0 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 7 Jun 2024 14:57:29 +0200 Subject: [PATCH 057/448] add file and csv output loggers --- core/log/solver_debug.cpp | 336 ++++++++++++++++++----- core/test/log/solver_debug.cpp | 114 +++++++- include/ginkgo/core/log/solver_debug.hpp | 66 ++--- 3 files changed, 401 insertions(+), 115 deletions(-) diff --git a/core/log/solver_debug.cpp b/core/log/solver_debug.cpp 
index 760f182bde1..69b6a3ad692 100644 --- a/core/log/solver_debug.cpp +++ b/core/log/solver_debug.cpp @@ -5,121 +5,315 @@ #include +#include #include +#include +#include #include +#include +#include #include #include #include #include +#include "core/base/dispatch_helper.hpp" + + namespace gko { namespace log { +namespace { -static void print_scalar(const LinOp* value, std::ostream& stream) +template +static bool dispatch_type(const LinOp* value, Functor fn) { + const auto host_exec = value->get_executor()->get_master(); using conv_to_double = ConvertibleTo>; using conv_to_complex = ConvertibleTo>>; - const auto host_exec = value->get_executor()->get_master(); - if (value->get_size()[0] == 0) { - stream << ""; - } else if (value->get_size()[0] != 1) { - stream << ""; - } else if (dynamic_cast(value)) { - auto host_value = matrix::Dense::create(host_exec); - host_value->copy_from(value); - stream << host_value->at(0, 0); + if (dynamic_cast(value)) { + auto host_vec = matrix::Dense::create(host_exec); + host_vec->copy_from(value); + fn(host_vec.get()); + return true; } else if (dynamic_cast(value)) { - auto host_value = - matrix::Dense>::create(host_exec); - host_value->copy_from(value); - stream << host_value->at(0, 0); + auto host_vec = matrix::Dense>::create(host_exec); + host_vec->copy_from(value); + fn(host_vec.get()); + return true; } else { - stream << ""; + return false; } } -void SolverDebug::on_linop_apply_started(const LinOp* solver, const LinOp* in, - const LinOp* out) const -{ - using solver_base = solver::detail::SolverBaseLinOp; - auto dynamic_type = name_demangling::get_dynamic_type(*solver); - auto& stream = *output_; - stream << dynamic_type << "::apply(" << in << ',' << out - << ") of dimensions " << solver->get_size() << " and " - << in->get_size()[1] << " rhs\n"; - if (const auto base = dynamic_cast(solver)) { - const auto scalars = base->get_workspace_scalars(); - const auto names = base->get_workspace_op_names(); - stream << 
std::setw(column_width_) << "Iteration"; - for (auto scalar : scalars) { - stream << std::setw(column_width_) << names[scalar]; - } - stream << '\n'; - } else { - stream << "This solver type is not supported by the SolverDebug logger"; - } -} +class SolverDebugPrint : public SolverDebug { + friend class SolverDebug; +public: + /* Internal solver events */ + void on_linop_apply_started(const LinOp* solver, const LinOp* in, + const LinOp* out) const override + { + printed_header_ = false; + } -void SolverDebug::on_iteration_complete( - const LinOp* solver, const LinOp* right_hand_side, const LinOp* solution, - const size_type& num_iterations, const LinOp* residual, - const LinOp* residual_norm, const LinOp* implicit_sq_residual_norm, - const array* status, bool stopped) const -{ - using solver_base = solver::detail::SolverBaseLinOp; - auto& stream = *output_; - stream << std::setprecision(precision_); - if (const auto base = dynamic_cast(solver)) { + void on_iteration_complete( + const LinOp* solver, const LinOp* right_hand_side, + const LinOp* solution, const size_type& num_iterations, + const LinOp* residual, const LinOp* residual_norm, + const LinOp* implicit_sq_residual_norm, + const array* status, bool stopped) const override + { + using solver_base = solver::detail::SolverBaseLinOp; + auto dynamic_type = name_demangling::get_dynamic_type(*solver); + auto& stream = *output_; + auto base = gko::as(solver); + if (!printed_header_) { + stream << dynamic_type << "::apply(" << right_hand_side << ',' + << solution << ") of dimensions " << solver->get_size() + << " and " << right_hand_side->get_size()[1] << " rhs\n"; + const auto scalars = base->get_workspace_scalars(); + const auto names = base->get_workspace_op_names(); + stream << std::setw(column_width_) << "Iteration"; + for (auto scalar : scalars) { + if (separator_) { + stream << separator_; + } + stream << std::setw(column_width_) << names[scalar]; + } + if (residual_norm) { + if (separator_) { + stream << 
separator_; + } + stream << std::setw(column_width_) << "residual_norm"; + } + if (implicit_sq_residual_norm) { + if (separator_) { + stream << separator_; + } + stream << std::setw(column_width_) + << "implicit_sq_residual_norm"; + } + stream << '\n'; + printed_header_ = true; + } + stream << std::setprecision(precision_); const auto scalars = base->get_workspace_scalars(); stream << std::setw(column_width_) << num_iterations; for (auto scalar : scalars) { - stream << std::setw(column_width_); print_scalar(base->get_workspace_op(scalar), stream); } + if (residual_norm) { + print_scalar(residual_norm, stream); + } + if (implicit_sq_residual_norm) { + print_scalar(implicit_sq_residual_norm, stream); + } stream << '\n'; } -} + GKO_DEPRECATED( + "Please use the version with the additional stopping " + "information.") + void on_iteration_complete(const LinOp* solver, + const size_type& num_iterations, + const LinOp* residual, const LinOp* solution, + const LinOp* residual_norm) const override + { + on_iteration_complete(solver, nullptr, solution, num_iterations, + residual, residual_norm, nullptr, nullptr, false); + } -void SolverDebug::on_iteration_complete(const LinOp* solver, - const size_type& num_iterations, - const LinOp* residual, - const LinOp* solution, - const LinOp* residual_norm) const -{ - on_iteration_complete(solver, nullptr, solution, num_iterations, residual, - residual_norm, nullptr, nullptr, false); -} + GKO_DEPRECATED( + "Please use the version with the additional stopping " + "information.") + void on_iteration_complete( + const LinOp* solver, const size_type& num_iterations, + const LinOp* residual, const LinOp* solution, + const LinOp* residual_norm, + const LinOp* implicit_sq_residual_norm) const override + { + on_iteration_complete(solver, nullptr, solution, num_iterations, + residual, residual_norm, + implicit_sq_residual_norm, nullptr, false); + } + +private: + void print_scalar(const LinOp* value, std::ostream& stream) const + { + if 
(separator_) { + stream << separator_; + } + stream << std::setw(column_width_); + if (!value->get_size()) { + stream << ""; + } else if (value->get_size()[0] != 1) { + stream << ""; + } else { + if (!dispatch_type( + value, [&](auto vector) { stream << vector->at(0, 0); })) { + stream << ""; + } + } + } + SolverDebugPrint(std::ostream& output, int precision, int column_width, + char separator) + : output_{&output}, + precision_{precision}, + column_width_{column_width}, + separator_{separator}, + printed_header_(false) + {} -void SolverDebug::on_iteration_complete( - const LinOp* solver, const size_type& num_iterations, const LinOp* residual, - const LinOp* solution, const LinOp* residual_norm, - const LinOp* implicit_sq_residual_norm) const + std::ostream* output_; + int precision_; + int column_width_; + char separator_; + mutable bool printed_header_; +}; + + +class SolverDebugStore : public SolverDebug { + friend class SolverDebug; + +public: + /* Internal solver events */ + void on_linop_apply_started(const LinOp* solver, const LinOp* in, + const LinOp* out) const override + { + using solver_base = solver::detail::SolverBaseLinOp; + auto dynamic_type = name_demangling::get_dynamic_type(*solver); + auto base = gko::as(solver); + store_vector(base->get_system_matrix().get(), "system_matrix"); + store_vector(in, "rhs"); + store_vector(out, "initial_guess"); + } + + void on_iteration_complete( + const LinOp* solver, const LinOp* right_hand_side, + const LinOp* solution, const size_type& num_iterations, + const LinOp* residual, const LinOp* residual_norm, + const LinOp* implicit_sq_residual_norm, + const array* status, bool stopped) const override + { + using solver_base = solver::detail::SolverBaseLinOp; + auto base = gko::as(solver); + const auto num_vectors = base->get_num_workspace_ops(); + const auto names = base->get_workspace_op_names(); + for (int i = 0; i < num_vectors; i++) { + store_vector(base->get_workspace_op(i), num_iterations, + 
base->get_workspace_op_names()[i]); + } + store_vector(solution, num_iterations, "solution"); + store_vector(residual, num_iterations, "residual"); + store_vector(residual_norm, num_iterations, "residual_norm"); + store_vector(implicit_sq_residual_norm, num_iterations, + "implicit_sq_residual_norm"); + } + + GKO_DEPRECATED( + "Please use the version with the additional stopping " + "information.") + void on_iteration_complete(const LinOp* solver, + const size_type& num_iterations, + const LinOp* residual, const LinOp* solution, + const LinOp* residual_norm) const override + { + on_iteration_complete(solver, nullptr, solution, num_iterations, + residual, residual_norm, nullptr, nullptr, false); + } + + GKO_DEPRECATED( + "Please use the version with the additional stopping " + "information.") + void on_iteration_complete( + const LinOp* solver, const size_type& num_iterations, + const LinOp* residual, const LinOp* solution, + const LinOp* residual_norm, + const LinOp* implicit_sq_residual_norm) const override + { + on_iteration_complete(solver, nullptr, solution, num_iterations, + residual, residual_norm, + implicit_sq_residual_norm, nullptr, false); + } + +private: + void store_vector(const LinOp* value, const std::string& name) const + { + const auto filename = + output_file_prefix_ + "_" + name + (binary_ ? ".bin" : ".mtx"); + if (!value) { + return; + } + // putting Dense first here causes gko::write to use dense output + run, gko::matrix::Dense, + gko::matrix::Dense>, + gko::matrix::Dense>, + // fallback for other matrix types + gko::WritableToMatrixData, + gko::WritableToMatrixData, + gko::WritableToMatrixData, int32>, + gko::WritableToMatrixData, int32>, + gko::WritableToMatrixData, + gko::WritableToMatrixData, + gko::WritableToMatrixData, int64>, + gko::WritableToMatrixData, int64>>( + value, [&](auto vector) { + std::ofstream output{ + filename, binary_ ? 
(std::ios::out | std::ios::binary) + : std::ios::out}; + if (binary_) { + gko::write_binary(output, vector); + } else { + gko::write(output, vector); + } + }); + } + + void store_vector(const LinOp* value, size_type iteration, + const std::string& name) const + { + store_vector(value, std::to_string(iteration) + "_" + name); + } + + SolverDebugStore(std::string output_file_prefix, bool binary) + : output_file_prefix_{std::move(output_file_prefix)}, binary_{binary} + {} + + std::string output_file_prefix_; + bool binary_; +}; + + +} // namespace + + +std::shared_ptr SolverDebug::create_scalar_table( + std::ostream& output, int precision, int column_width) { - on_iteration_complete(solver, nullptr, solution, num_iterations, residual, - residual_norm, implicit_sq_residual_norm, nullptr, - false); + return std::shared_ptr{ + new SolverDebugPrint{output, precision, column_width, '\0'}}; } -SolverDebug::SolverDebug(std::ostream& stream, int precision, int column_width) - : output_{&stream}, precision_{precision}, column_width_{column_width} -{} +std::shared_ptr SolverDebug::create_scalar_csv( + std::ostream& output, int precision, char separator) +{ + return std::shared_ptr{ + new SolverDebugPrint{output, precision, 0, separator}}; +} -std::shared_ptr SolverDebug::create(std::ostream& output, - int precision, - int column_width) +std::shared_ptr SolverDebug::create_vector_storage( + std::string output_file_prefix, bool binary) { return std::shared_ptr{ - new SolverDebug{output, precision, column_width}}; + new SolverDebugStore{output_file_prefix, binary}}; } diff --git a/core/test/log/solver_debug.cpp b/core/test/log/solver_debug.cpp index 2b0ec771590..ec1f76c6fa7 100644 --- a/core/test/log/solver_debug.cpp +++ b/core/test/log/solver_debug.cpp @@ -14,6 +14,7 @@ #include "core/test/utils.hpp" +#include "core/test/utils/assertions.hpp" template @@ -26,7 +27,8 @@ class SolverDebug : public ::testing::Test { { mtx = gko::initialize({T{1.0}}, ref); in = 
gko::initialize({T{2.0}}, ref); - out = mtx->clone(); + out = gko::initialize({T{4.0}}, ref); + zero = gko::initialize({T{0.0}}, ref); solver = Cg::build() .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) @@ -34,17 +36,39 @@ class SolverDebug : public ::testing::Test { ->generate(mtx); } + template + void assert_file_equals(const std::string& filename, Mtx* ref_mtx) + { + auto cleanup = [filename] { + std::remove((filename + ".mtx").c_str()); + std::remove((filename + ".bin").c_str()); + }; + if (!ref_mtx) { + cleanup(); + return; + } + SCOPED_TRACE(filename); + std::ifstream stream_mtx{filename + ".mtx"}; + std::ifstream stream_bin{filename + ".bin", std::ios::binary}; + auto mtx = gko::read(stream_mtx, ref); + auto mtx_bin = gko::read_binary(stream_bin, ref); + cleanup(); + GKO_ASSERT_MTX_NEAR(mtx, ref_mtx, 0.0); + GKO_ASSERT_MTX_NEAR(mtx_bin, ref_mtx, 0.0); + } + std::shared_ptr ref; std::shared_ptr mtx; std::shared_ptr in; std::unique_ptr out; + std::unique_ptr zero; std::unique_ptr solver; }; TYPED_TEST_SUITE(SolverDebug, gko::test::ValueTypes, TypenameNameGenerator); -TYPED_TEST(SolverDebug, Works) +TYPED_TEST(SolverDebug, TableWorks) { using T = TypeParam; std::stringstream ref_ss; @@ -57,21 +81,97 @@ TYPED_TEST(SolverDebug, Works) << std::setw(default_column_width) << "alpha" << std::setw(default_column_width) << "beta" << std::setw(default_column_width) << "prev_rho" - << std::setw(default_column_width) << "rho" << '\n'; + << std::setw(default_column_width) << "rho" + << std::setw(default_column_width) << "implicit_sq_residual_norm" + << '\n'; ref_ss << std::setw(default_column_width) << 0 << std::setw(default_column_width) << T{0.0} << std::setw(default_column_width) << T{0.0} << std::setw(default_column_width) << T{1.0} - << std::setw(default_column_width) << T{1.0} << '\n' + << std::setw(default_column_width) << T{4.0} + << std::setw(default_column_width) << T{4.0} << '\n' << std::setw(default_column_width) << 1 << 
std::setw(default_column_width) << T{0.0} - << std::setw(default_column_width) << T{1.0} + << std::setw(default_column_width) << T{4.0} << std::setw(default_column_width) << T{0.0} - << std::setw(default_column_width) << T{1.0} << '\n'; + << std::setw(default_column_width) << T{4.0} + << std::setw(default_column_width) << T{0.0} << '\n'; + std::stringstream ss; + this->solver->add_logger(gko::log::SolverDebug::create_scalar_table(ss)); + + this->solver->apply(this->in, this->out); + + ASSERT_EQ(ss.str(), ref_ss.str()); +} + + +TYPED_TEST(SolverDebug, CsvWorks) +{ + using T = TypeParam; + std::stringstream ref_ss; + auto dynamic_type = gko::name_demangling::get_dynamic_type(*this->solver); + ref_ss << dynamic_type << "::apply(" << this->in.get() << ',' + << this->out.get() << ") of dimensions " << this->solver->get_size() + << " and " << this->in->get_size()[1] << " rhs\n"; + ref_ss << "Iteration,alpha,beta,prev_rho,rho,implicit_sq_residual_norm" + << '\n'; + ref_ss << 0 << ',' << T{0.0} << ',' << T{0.0} << ',' << T{1.0} << ',' + << T{4.0} << ',' << T{4.0} << '\n' + << 1 << ',' << T{0.0} << ',' << T{4.0} << ',' << T{0.0} << ',' + << T{4.0} << ',' << T{0.0} << '\n'; std::stringstream ss; - this->solver->add_logger(gko::log::SolverDebug::create(ss)); + this->solver->add_logger(gko::log::SolverDebug::create_scalar_csv(ss)); this->solver->apply(this->in, this->out); ASSERT_EQ(ss.str(), ref_ss.str()); } + + +TYPED_TEST(SolverDebug, StorageWorks) +{ + using T = TypeParam; + using Dense = typename TestFixture::Dense; + auto orig_out = this->out->clone(); + auto init_residual = gko::initialize({T{-2.0}}, this->ref); + std::vector> files{ + {"solver_debug_test_0_alpha", this->zero.get()}, + {"solver_debug_test_0_beta", nullptr}, + {"solver_debug_test_0_implicit_sq_residual_norm", orig_out.get()}, + {"solver_debug_test_0_minus_one", nullptr}, + {"solver_debug_test_0_one", nullptr}, + {"solver_debug_test_0_p", nullptr}, + {"solver_debug_test_0_prev_rho", nullptr}, + 
{"solver_debug_test_0_q", nullptr}, + {"solver_debug_test_0_r", nullptr}, + {"solver_debug_test_0_residual", init_residual.get()}, + {"solver_debug_test_0_rho", nullptr}, + {"solver_debug_test_0_solution", orig_out.get()}, + {"solver_debug_test_0_z", nullptr}, + {"solver_debug_test_1_alpha", nullptr}, + {"solver_debug_test_1_beta", nullptr}, + {"solver_debug_test_1_implicit_sq_residual_norm", this->zero.get()}, + {"solver_debug_test_1_minus_one", nullptr}, + {"solver_debug_test_1_one", nullptr}, + {"solver_debug_test_1_p", nullptr}, + {"solver_debug_test_1_prev_rho", nullptr}, + {"solver_debug_test_1_q", nullptr}, + {"solver_debug_test_1_r", nullptr}, + {"solver_debug_test_1_residual", this->zero.get()}, + {"solver_debug_test_1_rho", nullptr}, + {"solver_debug_test_1_solution", this->in.get()}, + {"solver_debug_test_1_z", nullptr}, + {"solver_debug_test_initial_guess", orig_out.get()}, + {"solver_debug_test_rhs", this->in.get()}, + {"solver_debug_test_system_matrix", this->mtx.get()}}; + this->solver->add_logger(gko::log::SolverDebug::create_vector_storage( + "solver_debug_test", false)); + this->solver->add_logger(gko::log::SolverDebug::create_vector_storage( + "solver_debug_test", true)); + + this->solver->apply(this->in, this->out); + + for (auto pair : files) { + this->assert_file_equals(pair.first, pair.second); + } +} diff --git a/include/ginkgo/core/log/solver_debug.hpp b/include/ginkgo/core/log/solver_debug.hpp index 9b589f29b88..85e38338da2 100644 --- a/include/ginkgo/core/log/solver_debug.hpp +++ b/include/ginkgo/core/log/solver_debug.hpp @@ -10,7 +10,6 @@ #include -#include #include @@ -25,52 +24,45 @@ namespace log { */ class SolverDebug : public Logger { public: - /* Internal solver events */ - void on_linop_apply_started(const LinOp* A, const LinOp* b, - const LinOp* x) const override; - - void on_iteration_complete( - const LinOp* solver, const LinOp* right_hand_side, - const LinOp* solution, const size_type& num_iterations, - const LinOp* residual, 
const LinOp* residual_norm, - const LinOp* implicit_sq_residual_norm, - const array* status, bool stopped) const override; - - GKO_DEPRECATED( - "Please use the version with the additional stopping " - "information.") - void on_iteration_complete(const LinOp* solver, - const size_type& num_iterations, - const LinOp* residual, const LinOp* solution, - const LinOp* residual_norm) const override; - - GKO_DEPRECATED( - "Please use the version with the additional stopping " - "information.") - void on_iteration_complete( - const LinOp* solver, const size_type& num_iterations, - const LinOp* residual, const LinOp* solution, - const LinOp* residual_norm, - const LinOp* implicit_sq_residual_norm) const override; + /** + * Creates a logger printing the value for all scalar values in the solver + * after each iteration in an ASCII table. + * + * @param output the stream to write the output to. + * @param precision the number of digits of precision to print + * @param column_width the number of characters an output column is wide + */ + static std::shared_ptr create_scalar_table( + std::ostream& output, int precision = 6, int column_width = 12); + /** * Creates a logger printing the value for all scalar values in the solver - * after each iteration. + * after each iteration in a CSV table. * * @param output the stream to write the output to. * @param precision the number of digits of precision to print * @param column_width the number of characters an output column is wide */ - static std::shared_ptr create(std::ostream& output, - int precision = 6, - int column_width = 12); + static std::shared_ptr create_scalar_csv(std::ostream& output, + int precision = 6, + char separator = ','); -private: - SolverDebug(std::ostream& output, int precision, int column_width); - std::ostream* output_; - int precision_; - int column_width_; + /** + * Creates a logger storing all vectors and scalar values in the solver + * after each iteration on disk. 
+ * + * @param output the path and file name prefix used to generate the output + * file names. + * @param precision the number of digits of precision to print when + * outputting matrices in text format + * @param binary if true, write data in Ginkgo's own binary format + * (lossless), if false write data in the MatrixMarket format + * (potentially lossy) + */ + static std::shared_ptr create_vector_storage( + std::string output_file_prefix = "solver_", bool binary = false); }; From 56d5ef6bb253e201b64baef9ff6daae406d3e7bb Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 3 Jul 2024 19:11:00 +0200 Subject: [PATCH 058/448] improve naming and documentation --- core/log/solver_debug.cpp | 34 +++++++++++++++++------- core/test/log/solver_debug.cpp | 22 ++++++++------- include/ginkgo/core/log/solver_debug.hpp | 22 ++++++++------- 3 files changed, 50 insertions(+), 28 deletions(-) diff --git a/core/log/solver_debug.cpp b/core/log/solver_debug.cpp index 69b6a3ad692..e1acb233d03 100644 --- a/core/log/solver_debug.cpp +++ b/core/log/solver_debug.cpp @@ -2,24 +2,20 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include #include #include #include - #include #include #include #include #include +#include #include #include - #include "core/base/dispatch_helper.hpp" @@ -28,6 +24,15 @@ namespace log { namespace { +bool is_dense(const LinOp* value) +{ + using conv_to_double = ConvertibleTo>; + using conv_to_complex = ConvertibleTo>>; + return dynamic_cast(value) || + dynamic_cast(value); +} + + template static bool dispatch_type(const LinOp* value, Functor fn) { @@ -154,8 +159,19 @@ class SolverDebugPrint : public SolverDebug { } else if (value->get_size()[0] != 1) { stream << ""; } else { - if (!dispatch_type( - value, [&](auto vector) { stream << vector->at(0, 0); })) { + if (is_dense(value)) { + auto host_exec = value->get_executor()->get_master(); + run>, + ConvertibleTo>>>( + value, [&](auto vector) { + using vector_type = typename detail::pointee< + 
decltype(vector)>::result_type; + auto host_vec = vector_type::create(host_exec); + host_vec->copy_from(value); + stream << host_vec->at(0, 0); + }); + + } else { stream << ""; } } @@ -293,7 +309,7 @@ class SolverDebugStore : public SolverDebug { } // namespace -std::shared_ptr SolverDebug::create_scalar_table( +std::shared_ptr SolverDebug::create_scalar_table_writer( std::ostream& output, int precision, int column_width) { return std::shared_ptr{ @@ -301,7 +317,7 @@ std::shared_ptr SolverDebug::create_scalar_table( } -std::shared_ptr SolverDebug::create_scalar_csv( +std::shared_ptr SolverDebug::create_scalar_csv_writer( std::ostream& output, int precision, char separator) { return std::shared_ptr{ diff --git a/core/test/log/solver_debug.cpp b/core/test/log/solver_debug.cpp index ec1f76c6fa7..216e14289da 100644 --- a/core/test/log/solver_debug.cpp +++ b/core/test/log/solver_debug.cpp @@ -2,17 +2,13 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include - - #include - #include +#include #include #include - #include "core/test/utils.hpp" #include "core/test/utils/assertions.hpp" @@ -39,17 +35,21 @@ class SolverDebug : public ::testing::Test { template void assert_file_equals(const std::string& filename, Mtx* ref_mtx) { + SCOPED_TRACE(filename); auto cleanup = [filename] { std::remove((filename + ".mtx").c_str()); std::remove((filename + ".bin").c_str()); }; + std::ifstream stream_mtx{filename + ".mtx"}; + std::ifstream stream_bin{filename + ".bin", std::ios::binary}; + // check that the files exist + ASSERT_TRUE(stream_mtx.good()); + ASSERT_TRUE(stream_bin.good()); if (!ref_mtx) { cleanup(); return; } - SCOPED_TRACE(filename); - std::ifstream stream_mtx{filename + ".mtx"}; - std::ifstream stream_bin{filename + ".bin", std::ios::binary}; + // check that the files have the correct contents auto mtx = gko::read(stream_mtx, ref); auto mtx_bin = gko::read_binary(stream_bin, ref); cleanup(); @@ -97,7 +97,8 @@ TYPED_TEST(SolverDebug, TableWorks) << 
std::setw(default_column_width) << T{4.0} << std::setw(default_column_width) << T{0.0} << '\n'; std::stringstream ss; - this->solver->add_logger(gko::log::SolverDebug::create_scalar_table(ss)); + this->solver->add_logger( + gko::log::SolverDebug::create_scalar_table_writer(ss)); this->solver->apply(this->in, this->out); @@ -120,7 +121,8 @@ TYPED_TEST(SolverDebug, CsvWorks) << 1 << ',' << T{0.0} << ',' << T{4.0} << ',' << T{0.0} << ',' << T{4.0} << ',' << T{0.0} << '\n'; std::stringstream ss; - this->solver->add_logger(gko::log::SolverDebug::create_scalar_csv(ss)); + this->solver->add_logger( + gko::log::SolverDebug::create_scalar_csv_writer(ss)); this->solver->apply(this->in, this->out); diff --git a/include/ginkgo/core/log/solver_debug.hpp b/include/ginkgo/core/log/solver_debug.hpp index 85e38338da2..98db712cc44 100644 --- a/include/ginkgo/core/log/solver_debug.hpp +++ b/include/ginkgo/core/log/solver_debug.hpp @@ -9,7 +9,6 @@ #include #include - #include @@ -18,40 +17,45 @@ namespace log { /** - * This Logger prints the value of all scalar values stored internally by the - * solver after each iteration. If the solver is applied to multiple right-hand - * sides, only the first right-hand side gets printed. + * This Logger outputs the value of all scalar values (and potentially vectors) + * stored internally by the solver after each iteration. It needs to be attached + * to the solver being debugged. */ class SolverDebug : public Logger { public: /** * Creates a logger printing the value for all scalar values in the solver * after each iteration in an ASCII table. + * If the solver is applied to multiple right-hand sides, only the first + * right-hand side gets printed. * * @param output the stream to write the output to. 
* @param precision the number of digits of precision to print * @param column_width the number of characters an output column is wide */ - static std::shared_ptr create_scalar_table( + static std::shared_ptr create_scalar_table_writer( std::ostream& output, int precision = 6, int column_width = 12); /** * Creates a logger printing the value for all scalar values in the solver * after each iteration in a CSV table. + * If the solver is applied to multiple right-hand sides, only the first + * right-hand side gets printed. * * @param output the stream to write the output to. * @param precision the number of digits of precision to print - * @param column_width the number of characters an output column is wide + * @param separator the character separating columns from each other */ - static std::shared_ptr create_scalar_csv(std::ostream& output, - int precision = 6, - char separator = ','); + static std::shared_ptr create_scalar_csv_writer( + std::ostream& output, int precision = 6, char separator = ','); /** * Creates a logger storing all vectors and scalar values in the solver * after each iteration on disk. + * This logger can handle multiple right-hand sides, in contrast to + * create_scalar_table_writer or create_scalar_csv_writer. * * @param output the path and file name prefix used to generate the output * file names. 
From 4a796981a22bebe758abc532b7b8403839591437 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 4 Jul 2024 13:17:37 +0200 Subject: [PATCH 059/448] clean uninitialized values from comparison --- core/test/log/solver_debug.cpp | 35 +++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/core/test/log/solver_debug.cpp b/core/test/log/solver_debug.cpp index 216e14289da..90108116374 100644 --- a/core/test/log/solver_debug.cpp +++ b/core/test/log/solver_debug.cpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + #include #include @@ -78,20 +80,17 @@ TYPED_TEST(SolverDebug, TableWorks) << this->out.get() << ") of dimensions " << this->solver->get_size() << " and " << this->in->get_size()[1] << " rhs\n"; ref_ss << std::setw(default_column_width) << "Iteration" - << std::setw(default_column_width) << "alpha" << std::setw(default_column_width) << "beta" << std::setw(default_column_width) << "prev_rho" << std::setw(default_column_width) << "rho" << std::setw(default_column_width) << "implicit_sq_residual_norm" << '\n'; ref_ss << std::setw(default_column_width) << 0 - << std::setw(default_column_width) << T{0.0} << std::setw(default_column_width) << T{0.0} << std::setw(default_column_width) << T{1.0} << std::setw(default_column_width) << T{4.0} << std::setw(default_column_width) << T{4.0} << '\n' << std::setw(default_column_width) << 1 - << std::setw(default_column_width) << T{0.0} << std::setw(default_column_width) << T{4.0} << std::setw(default_column_width) << T{0.0} << std::setw(default_column_width) << T{4.0} @@ -102,7 +101,12 @@ TYPED_TEST(SolverDebug, TableWorks) this->solver->apply(this->in, this->out); - ASSERT_EQ(ss.str(), ref_ss.str()); + // the first value of beta is uninitialized, so we need to remove it + std::regex first_beta("\n 0 *[()0-9.e,+-]*"); + auto clean_str = std::regex_replace(ss.str(), first_beta, "\n 0"); + auto clean_ref = + std::regex_replace(ref_ss.str(), first_beta, "\n 0"); + 
ASSERT_EQ(clean_str, clean_ref); } @@ -114,19 +118,22 @@ TYPED_TEST(SolverDebug, CsvWorks) ref_ss << dynamic_type << "::apply(" << this->in.get() << ',' << this->out.get() << ") of dimensions " << this->solver->get_size() << " and " << this->in->get_size()[1] << " rhs\n"; - ref_ss << "Iteration,alpha,beta,prev_rho,rho,implicit_sq_residual_norm" - << '\n'; - ref_ss << 0 << ',' << T{0.0} << ',' << T{0.0} << ',' << T{1.0} << ',' - << T{4.0} << ',' << T{4.0} << '\n' - << 1 << ',' << T{0.0} << ',' << T{4.0} << ',' << T{0.0} << ',' - << T{4.0} << ',' << T{0.0} << '\n'; + ref_ss << "Iteration;beta;prev_rho;rho;implicit_sq_residual_norm" << '\n'; + ref_ss << 0 << ';' << T{0.0} << ';' << T{1.0} << ';' << T{4.0} << ';' + << T{4.0} << '\n' + << 1 << ';' << T{4.0} << ';' << T{0.0} << ';' << T{4.0} << ';' + << T{0.0} << '\n'; std::stringstream ss; this->solver->add_logger( - gko::log::SolverDebug::create_scalar_csv_writer(ss)); + gko::log::SolverDebug::create_scalar_csv_writer(ss, 6, ';')); this->solver->apply(this->in, this->out); - ASSERT_EQ(ss.str(), ref_ss.str()); + // the first value of beta is uninitialized, so we need to remove it + std::regex first_beta("\n0;[^;]*"); + auto clean_str = std::regex_replace(ss.str(), first_beta, "\n0;"); + auto clean_ref = std::regex_replace(ref_ss.str(), first_beta, "\n0;"); + ASSERT_EQ(clean_str, clean_ref); } @@ -137,7 +144,6 @@ TYPED_TEST(SolverDebug, StorageWorks) auto orig_out = this->out->clone(); auto init_residual = gko::initialize({T{-2.0}}, this->ref); std::vector> files{ - {"solver_debug_test_0_alpha", this->zero.get()}, {"solver_debug_test_0_beta", nullptr}, {"solver_debug_test_0_implicit_sq_residual_norm", orig_out.get()}, {"solver_debug_test_0_minus_one", nullptr}, @@ -150,8 +156,7 @@ TYPED_TEST(SolverDebug, StorageWorks) {"solver_debug_test_0_rho", nullptr}, {"solver_debug_test_0_solution", orig_out.get()}, {"solver_debug_test_0_z", nullptr}, - {"solver_debug_test_1_alpha", nullptr}, - {"solver_debug_test_1_beta", nullptr}, 
+ {"solver_debug_test_1_beta", orig_out.get()}, {"solver_debug_test_1_implicit_sq_residual_norm", this->zero.get()}, {"solver_debug_test_1_minus_one", nullptr}, {"solver_debug_test_1_one", nullptr}, From 369bfe63363240a476347529becfc3f5aa01d9bd Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 4 Jul 2024 13:19:41 +0200 Subject: [PATCH 060/448] remove unused scalars --- core/solver/bicg.cpp | 9 ++++----- core/solver/cg.cpp | 8 +++----- core/solver/fcg.cpp | 9 ++++----- include/ginkgo/core/solver/bicg.hpp | 12 +++++------- include/ginkgo/core/solver/cg.hpp | 12 +++++------- include/ginkgo/core/solver/fcg.hpp | 14 ++++++-------- 6 files changed, 27 insertions(+), 37 deletions(-) diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp index c379cb8df08..0b39b3664cc 100644 --- a/core/solver/bicg.cpp +++ b/core/solver/bicg.cpp @@ -124,7 +124,6 @@ void Bicg::apply_dense_impl(const matrix::Dense* dense_b, GKO_SOLVER_VECTOR(p2, dense_b); GKO_SOLVER_VECTOR(q2, dense_b); - GKO_SOLVER_SCALAR(alpha, dense_b); GKO_SOLVER_SCALAR(beta, dense_b); GKO_SOLVER_SCALAR(prev_rho, dense_b); GKO_SOLVER_SCALAR(rho, dense_b); @@ -255,7 +254,7 @@ int workspace_traits>::num_arrays(const Solver&) template int workspace_traits>::num_vectors(const Solver&) { - return 14; + return 13; } @@ -264,8 +263,8 @@ std::vector workspace_traits>::op_names( const Solver&) { return { - "r", "z", "p", "q", "r2", "z2", "p2", - "q2", "alpha", "beta", "prev_rho", "rho", "one", "minus_one", + "r", "z", "p", "q", "r2", "z2", "p2", + "q2", "beta", "prev_rho", "rho", "one", "minus_one", }; } @@ -281,7 +280,7 @@ std::vector workspace_traits>::array_names( template std::vector workspace_traits>::scalars(const Solver&) { - return {alpha, beta, prev_rho, rho}; + return {beta, prev_rho, rho}; } diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp index 20487b4cd0d..c512dc4313b 100644 --- a/core/solver/cg.cpp +++ b/core/solver/cg.cpp @@ -102,7 +102,6 @@ void Cg::apply_dense_impl(const VectorType* dense_b, 
GKO_SOLVER_VECTOR(p, dense_b); GKO_SOLVER_VECTOR(q, dense_b); - GKO_SOLVER_SCALAR(alpha, dense_b); GKO_SOLVER_SCALAR(beta, dense_b); GKO_SOLVER_SCALAR(prev_rho, dense_b); GKO_SOLVER_SCALAR(rho, dense_b); @@ -206,7 +205,7 @@ int workspace_traits>::num_arrays(const Solver&) template int workspace_traits>::num_vectors(const Solver&) { - return 10; + return 9; } @@ -215,8 +214,7 @@ std::vector workspace_traits>::op_names( const Solver&) { return { - "r", "z", "p", "q", "alpha", - "beta", "prev_rho", "rho", "one", "minus_one", + "r", "z", "p", "q", "beta", "prev_rho", "rho", "one", "minus_one", }; } @@ -232,7 +230,7 @@ std::vector workspace_traits>::array_names( template std::vector workspace_traits>::scalars(const Solver&) { - return {alpha, beta, prev_rho, rho}; + return {beta, prev_rho, rho}; } diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp index c4f79854c0a..6c65f63ccae 100644 --- a/core/solver/fcg.cpp +++ b/core/solver/fcg.cpp @@ -102,7 +102,6 @@ void Fcg::apply_dense_impl(const VectorType* dense_b, GKO_SOLVER_VECTOR(q, dense_b); GKO_SOLVER_VECTOR(t, dense_b); - GKO_SOLVER_SCALAR(alpha, dense_b); GKO_SOLVER_SCALAR(beta, dense_b); GKO_SOLVER_SCALAR(prev_rho, dense_b); GKO_SOLVER_SCALAR(rho, dense_b); @@ -209,7 +208,7 @@ int workspace_traits>::num_arrays(const Solver&) template int workspace_traits>::num_vectors(const Solver&) { - return 12; + return 11; } @@ -218,8 +217,8 @@ std::vector workspace_traits>::op_names( const Solver&) { return { - "r", "z", "p", "q", "t", "alpha", - "beta", "prev_rho", "rho", "rho_t", "one", "minus_one", + "r", "z", "p", "q", "t", "beta", + "prev_rho", "rho", "rho_t", "one", "minus_one", }; } @@ -235,7 +234,7 @@ std::vector workspace_traits>::array_names( template std::vector workspace_traits>::scalars(const Solver&) { - return {alpha, beta, prev_rho, rho, rho_t}; + return {beta, prev_rho, rho, rho_t}; } diff --git a/include/ginkgo/core/solver/bicg.hpp b/include/ginkgo/core/solver/bicg.hpp index 9f1ef54cc34..2a43c1ca3f8 100644 
--- a/include/ginkgo/core/solver/bicg.hpp +++ b/include/ginkgo/core/solver/bicg.hpp @@ -155,18 +155,16 @@ struct workspace_traits> { constexpr static int p2 = 6; // "transposed" q vector constexpr static int q2 = 7; - // alpha scalar - constexpr static int alpha = 8; // beta scalar - constexpr static int beta = 9; + constexpr static int beta = 8; // previous rho scalar - constexpr static int prev_rho = 10; + constexpr static int prev_rho = 9; // current rho scalar - constexpr static int rho = 11; + constexpr static int rho = 10; // constant 1.0 scalar - constexpr static int one = 12; + constexpr static int one = 11; // constant -1.0 scalar - constexpr static int minus_one = 13; + constexpr static int minus_one = 12; // stopping status array constexpr static int stop = 0; diff --git a/include/ginkgo/core/solver/cg.hpp b/include/ginkgo/core/solver/cg.hpp index 9d850ecbe6d..984d5d1f104 100644 --- a/include/ginkgo/core/solver/cg.hpp +++ b/include/ginkgo/core/solver/cg.hpp @@ -141,18 +141,16 @@ struct workspace_traits> { constexpr static int p = 2; // q vector constexpr static int q = 3; - // alpha scalar - constexpr static int alpha = 4; // beta scalar - constexpr static int beta = 5; + constexpr static int beta = 4; // previous rho scalar - constexpr static int prev_rho = 6; + constexpr static int prev_rho = 5; // current rho scalar - constexpr static int rho = 7; + constexpr static int rho = 6; // constant 1.0 scalar - constexpr static int one = 8; + constexpr static int one = 7; // constant -1.0 scalar - constexpr static int minus_one = 9; + constexpr static int minus_one = 8; // stopping status array constexpr static int stop = 0; diff --git a/include/ginkgo/core/solver/fcg.hpp b/include/ginkgo/core/solver/fcg.hpp index 4577dd1b1d4..dfaf252b557 100644 --- a/include/ginkgo/core/solver/fcg.hpp +++ b/include/ginkgo/core/solver/fcg.hpp @@ -148,20 +148,18 @@ struct workspace_traits> { constexpr static int q = 3; // t vector constexpr static int t = 4; - // alpha scalar 
- constexpr static int alpha = 5; // beta scalar - constexpr static int beta = 6; + constexpr static int beta = 5; // previous rho scalar - constexpr static int prev_rho = 7; + constexpr static int prev_rho = 6; // current rho scalar - constexpr static int rho = 8; + constexpr static int rho = 7; // current rho_t scalar - constexpr static int rho_t = 9; + constexpr static int rho_t = 8; // constant 1.0 scalar - constexpr static int one = 10; + constexpr static int one = 9; // constant -1.0 scalar - constexpr static int minus_one = 11; + constexpr static int minus_one = 10; // stopping status array constexpr static int stop = 0; From 889756225cd3b0e1a6b22bd432b510747d2ddf91 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 4 Jul 2024 13:36:54 +0200 Subject: [PATCH 061/448] clean up and simplify code --- core/log/solver_debug.cpp | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/core/log/solver_debug.cpp b/core/log/solver_debug.cpp index e1acb233d03..66febeb9ea4 100644 --- a/core/log/solver_debug.cpp +++ b/core/log/solver_debug.cpp @@ -33,28 +33,6 @@ bool is_dense(const LinOp* value) } -template -static bool dispatch_type(const LinOp* value, Functor fn) -{ - const auto host_exec = value->get_executor()->get_master(); - using conv_to_double = ConvertibleTo>; - using conv_to_complex = ConvertibleTo>>; - if (dynamic_cast(value)) { - auto host_vec = matrix::Dense::create(host_exec); - host_vec->copy_from(value); - fn(host_vec.get()); - return true; - } else if (dynamic_cast(value)) { - auto host_vec = matrix::Dense>::create(host_exec); - host_vec->copy_from(value); - fn(host_vec.get()); - return true; - } else { - return false; - } -} - - class SolverDebugPrint : public SolverDebug { friend class SolverDebug; @@ -167,7 +145,7 @@ class SolverDebugPrint : public SolverDebug { using vector_type = typename detail::pointee< decltype(vector)>::result_type; auto host_vec = vector_type::create(host_exec); - 
host_vec->copy_from(value); + vector->convert_to(host_vec); stream << host_vec->at(0, 0); }); From 9826e1b3a69598d76fe57afb78ab3c52550ad72b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 4 Jul 2024 14:14:12 +0200 Subject: [PATCH 062/448] simplify condition Co-authored-by: Marcel Koch --- core/log/solver_debug.cpp | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/core/log/solver_debug.cpp b/core/log/solver_debug.cpp index 66febeb9ea4..b93a7987326 100644 --- a/core/log/solver_debug.cpp +++ b/core/log/solver_debug.cpp @@ -136,22 +136,20 @@ class SolverDebugPrint : public SolverDebug { stream << ""; } else if (value->get_size()[0] != 1) { stream << ""; + } else if (is_dense(value)) { + auto host_exec = value->get_executor()->get_master(); + run>, + ConvertibleTo>>>( + value, [&](auto vector) { + using vector_type = + typename detail::pointee::result_type; + auto host_vec = vector_type::create(host_exec); + vector->convert_to(host_vec); + stream << host_vec->at(0, 0); + }); + } else { - if (is_dense(value)) { - auto host_exec = value->get_executor()->get_master(); - run>, - ConvertibleTo>>>( - value, [&](auto vector) { - using vector_type = typename detail::pointee< - decltype(vector)>::result_type; - auto host_vec = vector_type::create(host_exec); - vector->convert_to(host_vec); - stream << host_vec->at(0, 0); - }); - - } else { - stream << ""; - } + stream << ""; } } From 3669147c7ff194b65e26e64b53e924cef4010e73 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 5 Jul 2024 13:11:36 +0200 Subject: [PATCH 063/448] Rename solver_debug to solver_progress --- core/CMakeLists.txt | 2 +- .../{solver_debug.cpp => solver_progress.cpp} | 35 ++++---- core/test/log/CMakeLists.txt | 2 +- .../{solver_debug.cpp => solver_progress.cpp} | 80 +++++++++---------- .../{solver_debug.hpp => solver_progress.hpp} | 16 ++-- include/ginkgo/ginkgo.hpp | 2 +- 6 files changed, 69 insertions(+), 68 deletions(-) rename 
core/log/{solver_debug.cpp => solver_progress.cpp} (91%) rename core/test/log/{solver_debug.cpp => solver_progress.cpp} (69%) rename include/ginkgo/core/log/{solver_debug.hpp => solver_progress.hpp} (84%) diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 56d35e8edf0..df8f748b4d3 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -62,7 +62,7 @@ target_sources(${ginkgo_core} log/tau.cpp log/vtune.cpp log/record.cpp - log/solver_debug.cpp + log/solver_progress.cpp log/stream.cpp matrix/batch_csr.cpp matrix/batch_dense.cpp diff --git a/core/log/solver_debug.cpp b/core/log/solver_progress.cpp similarity index 91% rename from core/log/solver_debug.cpp rename to core/log/solver_progress.cpp index b93a7987326..effa0279bba 100644 --- a/core/log/solver_debug.cpp +++ b/core/log/solver_progress.cpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "ginkgo/core/log/solver_progress.hpp" + #include #include #include @@ -12,7 +14,6 @@ #include #include #include -#include #include #include @@ -33,8 +34,8 @@ bool is_dense(const LinOp* value) } -class SolverDebugPrint : public SolverDebug { - friend class SolverDebug; +class SolverProgressPrint : public SolverProgress { + friend class SolverProgress; public: /* Internal solver events */ @@ -153,8 +154,8 @@ class SolverDebugPrint : public SolverDebug { } } - SolverDebugPrint(std::ostream& output, int precision, int column_width, - char separator) + SolverProgressPrint(std::ostream& output, int precision, int column_width, + char separator) : output_{&output}, precision_{precision}, column_width_{column_width}, @@ -170,8 +171,8 @@ class SolverDebugPrint : public SolverDebug { }; -class SolverDebugStore : public SolverDebug { - friend class SolverDebug; +class SolverProgressStore : public SolverProgress { + friend class SolverProgress; public: /* Internal solver events */ @@ -273,7 +274,7 @@ class SolverDebugStore : public SolverDebug { store_vector(value, std::to_string(iteration) + "_" + 
name); } - SolverDebugStore(std::string output_file_prefix, bool binary) + SolverProgressStore(std::string output_file_prefix, bool binary) : output_file_prefix_{std::move(output_file_prefix)}, binary_{binary} {} @@ -285,27 +286,27 @@ class SolverDebugStore : public SolverDebug { } // namespace -std::shared_ptr SolverDebug::create_scalar_table_writer( +std::shared_ptr SolverProgress::create_scalar_table_writer( std::ostream& output, int precision, int column_width) { - return std::shared_ptr{ - new SolverDebugPrint{output, precision, column_width, '\0'}}; + return std::shared_ptr{ + new SolverProgressPrint{output, precision, column_width, '\0'}}; } -std::shared_ptr SolverDebug::create_scalar_csv_writer( +std::shared_ptr SolverProgress::create_scalar_csv_writer( std::ostream& output, int precision, char separator) { - return std::shared_ptr{ - new SolverDebugPrint{output, precision, 0, separator}}; + return std::shared_ptr{ + new SolverProgressPrint{output, precision, 0, separator}}; } -std::shared_ptr SolverDebug::create_vector_storage( +std::shared_ptr SolverProgress::create_vector_storage( std::string output_file_prefix, bool binary) { - return std::shared_ptr{ - new SolverDebugStore{output_file_prefix, binary}}; + return std::shared_ptr{ + new SolverProgressStore{output_file_prefix, binary}}; } diff --git a/core/test/log/CMakeLists.txt b/core/test/log/CMakeLists.txt index 1231b996f5a..6e8c89ef671 100644 --- a/core/test/log/CMakeLists.txt +++ b/core/test/log/CMakeLists.txt @@ -6,5 +6,5 @@ endif() ginkgo_create_test(performance_hint) ginkgo_create_test(profiler_hook) ginkgo_create_test(record) -ginkgo_create_test(solver_debug) +ginkgo_create_test(solver_progress) ginkgo_create_test(stream) diff --git a/core/test/log/solver_debug.cpp b/core/test/log/solver_progress.cpp similarity index 69% rename from core/test/log/solver_debug.cpp rename to core/test/log/solver_progress.cpp index 90108116374..f2433779864 100644 --- a/core/test/log/solver_debug.cpp +++ 
b/core/test/log/solver_progress.cpp @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include @@ -16,12 +16,12 @@ template -class SolverDebug : public ::testing::Test { +class SolverProgress : public ::testing::Test { public: using Dense = gko::matrix::Dense; using Cg = gko::solver::Cg; - SolverDebug() : ref{gko::ReferenceExecutor::create()} + SolverProgress() : ref{gko::ReferenceExecutor::create()} { mtx = gko::initialize({T{1.0}}, ref); in = gko::initialize({T{2.0}}, ref); @@ -67,10 +67,10 @@ class SolverDebug : public ::testing::Test { std::unique_ptr solver; }; -TYPED_TEST_SUITE(SolverDebug, gko::test::ValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(SolverProgress, gko::test::ValueTypes, TypenameNameGenerator); -TYPED_TEST(SolverDebug, TableWorks) +TYPED_TEST(SolverProgress, TableWorks) { using T = TypeParam; std::stringstream ref_ss; @@ -97,7 +97,7 @@ TYPED_TEST(SolverDebug, TableWorks) << std::setw(default_column_width) << T{0.0} << '\n'; std::stringstream ss; this->solver->add_logger( - gko::log::SolverDebug::create_scalar_table_writer(ss)); + gko::log::SolverProgress::create_scalar_table_writer(ss)); this->solver->apply(this->in, this->out); @@ -110,7 +110,7 @@ TYPED_TEST(SolverDebug, TableWorks) } -TYPED_TEST(SolverDebug, CsvWorks) +TYPED_TEST(SolverProgress, CsvWorks) { using T = TypeParam; std::stringstream ref_ss; @@ -125,7 +125,7 @@ TYPED_TEST(SolverDebug, CsvWorks) << T{0.0} << '\n'; std::stringstream ss; this->solver->add_logger( - gko::log::SolverDebug::create_scalar_csv_writer(ss, 6, ';')); + gko::log::SolverProgress::create_scalar_csv_writer(ss, 6, ';')); this->solver->apply(this->in, this->out); @@ -137,44 +137,44 @@ TYPED_TEST(SolverDebug, CsvWorks) } -TYPED_TEST(SolverDebug, StorageWorks) +TYPED_TEST(SolverProgress, StorageWorks) { using T = TypeParam; using Dense = typename TestFixture::Dense; auto orig_out = this->out->clone(); auto init_residual = gko::initialize({T{-2.0}}, this->ref); std::vector> files{ - 
{"solver_debug_test_0_beta", nullptr}, - {"solver_debug_test_0_implicit_sq_residual_norm", orig_out.get()}, - {"solver_debug_test_0_minus_one", nullptr}, - {"solver_debug_test_0_one", nullptr}, - {"solver_debug_test_0_p", nullptr}, - {"solver_debug_test_0_prev_rho", nullptr}, - {"solver_debug_test_0_q", nullptr}, - {"solver_debug_test_0_r", nullptr}, - {"solver_debug_test_0_residual", init_residual.get()}, - {"solver_debug_test_0_rho", nullptr}, - {"solver_debug_test_0_solution", orig_out.get()}, - {"solver_debug_test_0_z", nullptr}, - {"solver_debug_test_1_beta", orig_out.get()}, - {"solver_debug_test_1_implicit_sq_residual_norm", this->zero.get()}, - {"solver_debug_test_1_minus_one", nullptr}, - {"solver_debug_test_1_one", nullptr}, - {"solver_debug_test_1_p", nullptr}, - {"solver_debug_test_1_prev_rho", nullptr}, - {"solver_debug_test_1_q", nullptr}, - {"solver_debug_test_1_r", nullptr}, - {"solver_debug_test_1_residual", this->zero.get()}, - {"solver_debug_test_1_rho", nullptr}, - {"solver_debug_test_1_solution", this->in.get()}, - {"solver_debug_test_1_z", nullptr}, - {"solver_debug_test_initial_guess", orig_out.get()}, - {"solver_debug_test_rhs", this->in.get()}, - {"solver_debug_test_system_matrix", this->mtx.get()}}; - this->solver->add_logger(gko::log::SolverDebug::create_vector_storage( - "solver_debug_test", false)); - this->solver->add_logger(gko::log::SolverDebug::create_vector_storage( - "solver_debug_test", true)); + {"solver_progress_test_0_beta", nullptr}, + {"solver_progress_test_0_implicit_sq_residual_norm", orig_out.get()}, + {"solver_progress_test_0_minus_one", nullptr}, + {"solver_progress_test_0_one", nullptr}, + {"solver_progress_test_0_p", nullptr}, + {"solver_progress_test_0_prev_rho", nullptr}, + {"solver_progress_test_0_q", nullptr}, + {"solver_progress_test_0_r", nullptr}, + {"solver_progress_test_0_residual", init_residual.get()}, + {"solver_progress_test_0_rho", nullptr}, + {"solver_progress_test_0_solution", orig_out.get()}, + 
{"solver_progress_test_0_z", nullptr}, + {"solver_progress_test_1_beta", orig_out.get()}, + {"solver_progress_test_1_implicit_sq_residual_norm", this->zero.get()}, + {"solver_progress_test_1_minus_one", nullptr}, + {"solver_progress_test_1_one", nullptr}, + {"solver_progress_test_1_p", nullptr}, + {"solver_progress_test_1_prev_rho", nullptr}, + {"solver_progress_test_1_q", nullptr}, + {"solver_progress_test_1_r", nullptr}, + {"solver_progress_test_1_residual", this->zero.get()}, + {"solver_progress_test_1_rho", nullptr}, + {"solver_progress_test_1_solution", this->in.get()}, + {"solver_progress_test_1_z", nullptr}, + {"solver_progress_test_initial_guess", orig_out.get()}, + {"solver_progress_test_rhs", this->in.get()}, + {"solver_progress_test_system_matrix", this->mtx.get()}}; + this->solver->add_logger(gko::log::SolverProgress::create_vector_storage( + "solver_progress_test", false)); + this->solver->add_logger(gko::log::SolverProgress::create_vector_storage( + "solver_progress_test", true)); this->solver->apply(this->in, this->out); diff --git a/include/ginkgo/core/log/solver_debug.hpp b/include/ginkgo/core/log/solver_progress.hpp similarity index 84% rename from include/ginkgo/core/log/solver_debug.hpp rename to include/ginkgo/core/log/solver_progress.hpp index 98db712cc44..71e08fc96c9 100644 --- a/include/ginkgo/core/log/solver_debug.hpp +++ b/include/ginkgo/core/log/solver_progress.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_PUBLIC_CORE_LOG_SOLVER_DEBUG_HPP_ -#define GKO_PUBLIC_CORE_LOG_SOLVER_DEBUG_HPP_ +#ifndef GKO_PUBLIC_CORE_LOG_SOLVER_PROGRESS_HPP_ +#define GKO_PUBLIC_CORE_LOG_SOLVER_PROGRESS_HPP_ #include @@ -19,9 +19,9 @@ namespace log { /** * This Logger outputs the value of all scalar values (and potentially vectors) * stored internally by the solver after each iteration. It needs to be attached - * to the solver being debugged. + * to the solver being inspected. 
*/ -class SolverDebug : public Logger { +class SolverProgress : public Logger { public: /** * Creates a logger printing the value for all scalar values in the solver @@ -33,7 +33,7 @@ class SolverDebug : public Logger { * @param precision the number of digits of precision to print * @param column_width the number of characters an output column is wide */ - static std::shared_ptr create_scalar_table_writer( + static std::shared_ptr create_scalar_table_writer( std::ostream& output, int precision = 6, int column_width = 12); @@ -47,7 +47,7 @@ class SolverDebug : public Logger { * @param precision the number of digits of precision to print * @param separator the character separating columns from each other */ - static std::shared_ptr create_scalar_csv_writer( + static std::shared_ptr create_scalar_csv_writer( std::ostream& output, int precision = 6, char separator = ','); @@ -65,7 +65,7 @@ class SolverDebug : public Logger { * (lossless), if false write data in the MatrixMarket format * (potentially lossy) */ - static std::shared_ptr create_vector_storage( + static std::shared_ptr create_vector_storage( std::string output_file_prefix = "solver_", bool binary = false); }; @@ -74,4 +74,4 @@ class SolverDebug : public Logger { } // namespace gko -#endif // GKO_PUBLIC_CORE_LOG_SOLVER_DEBUG_HPP_ +#endif // GKO_PUBLIC_CORE_LOG_SOLVER_PROGRESS_HPP_ diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index 2e307792c85..0fab93dcefe 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -87,7 +87,7 @@ #include #include #include -#include +#include #include #include From c3fb39810d5cbcf98390429a23f7102879ef63c8 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 4 Jul 2024 11:06:29 +0200 Subject: [PATCH 064/448] add the failing test that the transposed trs uses different alg --- reference/test/solver/lower_trs_kernels.cpp | 69 ++++++++++++++++++++- reference/test/solver/upper_trs_kernels.cpp | 69 ++++++++++++++++++++- 2 files changed, 134 insertions(+), 4 deletions(-) diff --git a/reference/test/solver/lower_trs_kernels.cpp b/reference/test/solver/lower_trs_kernels.cpp index da2e55700f5..3680f19681f 100644 --- a/reference/test/solver/lower_trs_kernels.cpp +++ b/reference/test/solver/lower_trs_kernels.cpp @@ -54,6 +54,10 @@ class LowerTrs : public ::testing::Test { {365.0, 97.0, -654.0, 8.0, 91.0}}, exec)), lower_trs_factory(Solver::build().on(exec)), + lower_trs_syncfree_factory( + Solver::build() + .with_algorithm(gko::solver::trisolve_algorithm::syncfree) + .on(exec)), lower_trs_factory_mrhs(Solver::build().with_num_rhs(2u).on(exec)), lower_trs_factory_unit( Solver::build().with_unit_diagonal(true).on(exec)) @@ -66,6 +70,7 @@ class LowerTrs : public ::testing::Test { std::shared_ptr mtx_big_lower; std::shared_ptr mtx_big_general; std::unique_ptr lower_trs_factory; + std::unique_ptr lower_trs_syncfree_factory; std::unique_ptr lower_trs_factory_mrhs; std::unique_ptr lower_trs_factory_unit; }; @@ -348,13 +353,21 @@ TYPED_TEST(LowerTrs, SolvesTransposedTriangularSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + using Solver = typename TestFixture::Solver; std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); auto solver = this->lower_trs_factory->generate(this->mtx); + auto transposed_solver = + gko::as(solver->transpose()); - solver->transpose()->apply(b, x); + transposed_solver->apply(b, x); GKO_ASSERT_MTX_NEAR(x, l({0.0, 0.0, 1.0}), r::value); + // Ensure that the other test with syncfree is not the default option + ASSERT_EQ(solver->get_parameters().algorithm, + 
gko::solver::trisolve_algorithm::sparselib); + ASSERT_EQ(transposed_solver->get_parameters().algorithm, + solver->get_parameters().algorithm); } @@ -362,13 +375,65 @@ TYPED_TEST(LowerTrs, SolvesConjTransposedTriangularSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + using Solver = typename TestFixture::Solver; std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); auto solver = this->lower_trs_factory->generate(this->mtx); + auto conj_transposed_solver = + gko::as(solver->conj_transpose()); - solver->conj_transpose()->apply(b, x); + conj_transposed_solver->apply(b, x); GKO_ASSERT_MTX_NEAR(x, l({0.0, 0.0, 1.0}), r::value); + // Ensure that the other test with syncfree is not the default option + ASSERT_EQ(solver->get_parameters().algorithm, + gko::solver::trisolve_algorithm::sparselib); + ASSERT_EQ(conj_transposed_solver->get_parameters().algorithm, + solver->get_parameters().algorithm); +} + + +TYPED_TEST(LowerTrs, SolvesTransposedTriangularSystemWithSyncFree) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using Solver = typename TestFixture::Solver; + std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + auto solver = this->lower_trs_syncfree_factory->generate(this->mtx); + auto transposed_solver = + gko::as(solver->transpose()); + + transposed_solver->apply(b, x); + + GKO_ASSERT_MTX_NEAR(x, l({0.0, 0.0, 1.0}), r::value); + // Ensure that this test uses syncfree + ASSERT_EQ(solver->get_parameters().algorithm, + gko::solver::trisolve_algorithm::syncfree); + ASSERT_EQ(transposed_solver->get_parameters().algorithm, + solver->get_parameters().algorithm); +} + + +TYPED_TEST(LowerTrs, SolvesConjTransposedTriangularSystemWithSyncFree) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename 
TestFixture::value_type; + using Solver = typename TestFixture::Solver; + std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + auto solver = this->lower_trs_syncfree_factory->generate(this->mtx); + auto conj_transposed_solver = + gko::as(solver->conj_transpose()); + + conj_transposed_solver->apply(b, x); + + GKO_ASSERT_MTX_NEAR(x, l({0.0, 0.0, 1.0}), r::value); + // Ensure that this test uses syncfree + ASSERT_EQ(solver->get_parameters().algorithm, + gko::solver::trisolve_algorithm::syncfree); + ASSERT_EQ(conj_transposed_solver->get_parameters().algorithm, + solver->get_parameters().algorithm); } diff --git a/reference/test/solver/upper_trs_kernels.cpp b/reference/test/solver/upper_trs_kernels.cpp index dc964e6b83d..a60f3b46079 100644 --- a/reference/test/solver/upper_trs_kernels.cpp +++ b/reference/test/solver/upper_trs_kernels.cpp @@ -54,6 +54,10 @@ class UpperTrs : public ::testing::Test { {0.0, 2.0, 0.0, 4.0, 124.0}}, exec)), upper_trs_factory(Solver::build().on(exec)), + upper_trs_syncfree_factory( + Solver::build() + .with_algorithm(gko::solver::trisolve_algorithm::syncfree) + .on(exec)), upper_trs_factory_mrhs(Solver::build().with_num_rhs(2u).on(exec)), upper_trs_factory_unit( Solver::build().with_unit_diagonal(true).on(exec)) @@ -66,6 +70,7 @@ class UpperTrs : public ::testing::Test { std::shared_ptr mtx_big_upper; std::shared_ptr mtx_big_general; std::unique_ptr upper_trs_factory; + std::unique_ptr upper_trs_syncfree_factory; std::unique_ptr upper_trs_factory_mrhs; std::unique_ptr upper_trs_factory_unit; }; @@ -349,13 +354,21 @@ TYPED_TEST(UpperTrs, SolvesTransposedTriangularSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + using Solver = typename TestFixture::Solver; std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); auto solver = 
this->upper_trs_factory->generate(this->mtx); + auto transposed_solver = + gko::as(solver->transpose()); - solver->transpose()->apply(b, x); + transposed_solver->apply(b, x); GKO_ASSERT_MTX_NEAR(x, l({4.0, -10.0, 19.0}), r::value); + // Ensure that the other test with syncfree is not the default option + ASSERT_EQ(solver->get_parameters().algorithm, + gko::solver::trisolve_algorithm::sparselib); + ASSERT_EQ(transposed_solver->get_parameters().algorithm, + solver->get_parameters().algorithm); } @@ -363,13 +376,65 @@ TYPED_TEST(UpperTrs, SolvesConjTransposedTriangularSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + using Solver = typename TestFixture::Solver; std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); auto solver = this->upper_trs_factory->generate(this->mtx); + auto conj_transposed_solver = + gko::as(solver->conj_transpose()); - solver->conj_transpose()->apply(b, x); + conj_transposed_solver->apply(b, x); GKO_ASSERT_MTX_NEAR(x, l({4.0, -10.0, 19.0}), r::value); + // Ensure that the other test with syncfree is not the default option + ASSERT_EQ(solver->get_parameters().algorithm, + gko::solver::trisolve_algorithm::sparselib); + ASSERT_EQ(conj_transposed_solver->get_parameters().algorithm, + solver->get_parameters().algorithm); +} + + +TYPED_TEST(UpperTrs, SolvesTransposedTriangularSystemWithSyncFree) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using Solver = typename TestFixture::Solver; + std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + auto solver = this->upper_trs_syncfree_factory->generate(this->mtx); + auto transposed_solver = + gko::as(solver->transpose()); + + transposed_solver->apply(b, x); + + GKO_ASSERT_MTX_NEAR(x, l({4.0, -10.0, 19.0}), r::value); + // Ensure that this test uses syncfree + 
ASSERT_EQ(solver->get_parameters().algorithm, + gko::solver::trisolve_algorithm::syncfree); + ASSERT_EQ(transposed_solver->get_parameters().algorithm, + solver->get_parameters().algorithm); +} + + +TYPED_TEST(UpperTrs, SolvesConjTransposedTriangularSystemWithSyncFree) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename TestFixture::value_type; + using Solver = typename TestFixture::Solver; + std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, this->exec); + auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); + auto solver = this->upper_trs_syncfree_factory->generate(this->mtx); + auto conj_transposed_solver = + gko::as(solver->conj_transpose()); + + conj_transposed_solver->apply(b, x); + + GKO_ASSERT_MTX_NEAR(x, l({4.0, -10.0, 19.0}), r::value); + // Ensure that this test uses syncfree + ASSERT_EQ(solver->get_parameters().algorithm, + gko::solver::trisolve_algorithm::syncfree); + ASSERT_EQ(conj_transposed_solver->get_parameters().algorithm, + solver->get_parameters().algorithm); } From b6a7f61ddcf1bbef98bfb8ddff7039a7ca893ac9 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 4 Jul 2024 11:19:28 +0200 Subject: [PATCH 065/448] fix: transposed trs uses the same alg --- core/solver/lower_trs.cpp | 2 ++ core/solver/upper_trs.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/core/solver/lower_trs.cpp b/core/solver/lower_trs.cpp index e8230625ab3..3048c877dbd 100644 --- a/core/solver/lower_trs.cpp +++ b/core/solver/lower_trs.cpp @@ -99,6 +99,7 @@ std::unique_ptr LowerTrs::transpose() const { return transposed_type::build() .with_num_rhs(this->parameters_.num_rhs) + .with_algorithm(this->parameters_.algorithm) .on(this->get_executor()) ->generate(share(this->get_system_matrix()->transpose())); } @@ -109,6 +110,7 @@ std::unique_ptr LowerTrs::conj_transpose() const { return transposed_type::build() .with_num_rhs(this->parameters_.num_rhs) + .with_algorithm(this->parameters_.algorithm) .on(this->get_executor()) ->generate(share(this->get_system_matrix()->conj_transpose())); } diff --git a/core/solver/upper_trs.cpp b/core/solver/upper_trs.cpp index be6fcc71275..c759c119647 100644 --- a/core/solver/upper_trs.cpp +++ b/core/solver/upper_trs.cpp @@ -99,6 +99,7 @@ std::unique_ptr UpperTrs::transpose() const { return transposed_type::build() .with_num_rhs(this->parameters_.num_rhs) + .with_algorithm(this->parameters_.algorithm) .on(this->get_executor()) ->generate(share(this->get_system_matrix()->transpose())); } @@ -109,6 +110,7 @@ std::unique_ptr UpperTrs::conj_transpose() const { return transposed_type::build() .with_num_rhs(this->parameters_.num_rhs) + .with_algorithm(this->parameters_.algorithm) .on(this->get_executor()) ->generate(share(this->get_system_matrix()->conj_transpose())); } From f0a975bba8fe1369e211ddff6fa7d28015b173a6 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 4 Jul 2024 13:18:04 +0200 Subject: [PATCH 066/448] add the test for inconsistent behavior on diag zero --- .../test/preconditioner/jacobi_kernels.dp.cpp | 23 +++++++++++++++++++ reference/test/preconditioner/jacobi.cpp | 20 ++++++++++++++++ test/preconditioner/jacobi_kernels.cpp | 23 +++++++++++++++++++ 3 files changed, 66 insertions(+) diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp index 6dcfe460c71..ebc92fcb4d3 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp @@ -905,4 +905,27 @@ TEST_F( } +TEST_F(Jacobi, ScalarJacobiHandleZero) +{ + auto mtx = gko::share( + gko::initialize({{0, 0, 0}, {0, 2, 0}, {0, 0, 0}}, ref)); + auto b = gko::initialize({1, 2, 3}, ref); + auto x = Vec::create(ref, gko::dim<2>(3, 1)); + auto jacobi = Bj::build().with_max_block_size(1u).on(ref)->generate(mtx); + auto d_b = b->clone(dpcpp); + auto d_x = x->clone(dpcpp); + auto d_mtx = gko::share(mtx->clone(dpcpp)); + // Must generate from scratch because the clone copies the inverted + // information. + auto d_jacobi = + Bj::build().with_max_block_size(1u).on(dpcpp)->generate(d_mtx); + + // Jacobi uses 1 as the result when diagonal value is zero. 
+ jacobi->apply(b, x); + d_jacobi->apply(d_b, d_x); + + GKO_ASSERT_MTX_NEAR(d_x, x, 0.0); +} + + } // namespace diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp index 801250a9826..4dd9b5bcb31 100644 --- a/reference/test/preconditioner/jacobi.cpp +++ b/reference/test/preconditioner/jacobi.cpp @@ -494,4 +494,24 @@ TYPED_TEST(Jacobi, ScalarJacobiGeneratesOnDifferentPrecision) } +TYPED_TEST(Jacobi, ScalarJacobiHandleZero) +{ + using value_type = typename TestFixture::value_type; + using Vec = typename TestFixture::Vec; + using Bj = typename TestFixture::Bj; + auto mtx = gko::share( + gko::initialize({{0, 0, 0}, {0, 2, 0}, {0, 0, 0}}, this->exec)); + auto b = gko::initialize({1, 2, 3}, this->exec); + auto x = Vec::create(this->exec, gko::dim<2>(3, 1)); + auto jacobi = this->scalar_j_factory->generate(mtx); + + // Jacobi uses 1 as the result when diagonal value is zero. + jacobi->apply(b, x); + + ASSERT_EQ(x->at(0, 0), value_type{1.0}); + ASSERT_EQ(x->at(1, 0), value_type{1.0}); + ASSERT_EQ(x->at(2, 0), value_type{3.0}); +} + + } // namespace diff --git a/test/preconditioner/jacobi_kernels.cpp b/test/preconditioner/jacobi_kernels.cpp index 23347d8d896..bc50519f7d7 100644 --- a/test/preconditioner/jacobi_kernels.cpp +++ b/test/preconditioner/jacobi_kernels.cpp @@ -887,3 +887,26 @@ TEST_F( GKO_ASSERT_MTX_NEAR(d_x, x, 1e-6); } + + +TEST_F(Jacobi, ScalarJacobiHandleZero) +{ + auto mtx = gko::share( + gko::initialize({{0, 0, 0}, {0, 2, 0}, {0, 0, 0}}, ref)); + auto b = gko::initialize({1, 2, 3}, ref); + auto x = Vec::create(ref, gko::dim<2>(3, 1)); + auto jacobi = Bj::build().with_max_block_size(1u).on(ref)->generate(mtx); + auto d_b = b->clone(exec); + auto d_x = x->clone(exec); + auto d_mtx = gko::share(mtx->clone(exec)); + // Must generate from scratch because the clone copies the inverted + // information. 
+ auto d_jacobi = + Bj::build().with_max_block_size(1u).on(exec)->generate(d_mtx); + + // Jacobi uses 1 as the result when diagonal value is zero. + jacobi->apply(b, x); + d_jacobi->apply(d_b, d_x); + + GKO_ASSERT_MTX_NEAR(d_x, x, 0.0); +} From ae21bd8c2d5c1d73fe5372b09e0d03d3d159cb70 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 4 Jul 2024 16:32:42 +0200 Subject: [PATCH 067/448] fix the backend version on the diag zero --- common/unified/preconditioner/jacobi_kernels.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common/unified/preconditioner/jacobi_kernels.cpp b/common/unified/preconditioner/jacobi_kernels.cpp index b8c19c24f79..dce00fd1366 100644 --- a/common/unified/preconditioner/jacobi_kernels.cpp +++ b/common/unified/preconditioner/jacobi_kernels.cpp @@ -42,7 +42,9 @@ void invert_diagonal(std::shared_ptr exec, run_kernel( exec, [] GKO_KERNEL(auto elem, auto diag, auto inv_diag) { - inv_diag[elem] = safe_divide(one(diag[elem]), diag[elem]); + // if the diagonal is zero, we use 1 for the inverted result. + inv_diag[elem] = is_zero(diag[elem]) ? one(diag[elem]) + : one(diag[elem]) / diag[elem]; }, diag.get_size(), diag, inv_diag); } From 4740e180a0261912a5cc682a4a50d9f591ff577d Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M.
Tsai" Date: Sat, 6 Jul 2024 09:17:01 +0200 Subject: [PATCH 068/448] AAA rules in test Co-authored-by: Marcel Koch --- dpcpp/test/preconditioner/jacobi_kernels.dp.cpp | 4 ++-- reference/test/preconditioner/jacobi.cpp | 2 +- test/preconditioner/jacobi_kernels.cpp | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp index ebc92fcb4d3..b8950ed2d2a 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp @@ -911,10 +911,11 @@ TEST_F(Jacobi, ScalarJacobiHandleZero) gko::initialize({{0, 0, 0}, {0, 2, 0}, {0, 0, 0}}, ref)); auto b = gko::initialize({1, 2, 3}, ref); auto x = Vec::create(ref, gko::dim<2>(3, 1)); - auto jacobi = Bj::build().with_max_block_size(1u).on(ref)->generate(mtx); auto d_b = b->clone(dpcpp); auto d_x = x->clone(dpcpp); auto d_mtx = gko::share(mtx->clone(dpcpp)); + + auto jacobi = Bj::build().with_max_block_size(1u).on(ref)->generate(mtx); // Must generate from scratch because the clone copies the inverted // information. auto d_jacobi = @@ -923,7 +924,6 @@ TEST_F(Jacobi, ScalarJacobiHandleZero) // Jacobi uses 1 as the result when diagonal value is zero. jacobi->apply(b, x); d_jacobi->apply(d_b, d_x); - GKO_ASSERT_MTX_NEAR(d_x, x, 0.0); } diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp index 4dd9b5bcb31..79c276579ad 100644 --- a/reference/test/preconditioner/jacobi.cpp +++ b/reference/test/preconditioner/jacobi.cpp @@ -503,11 +503,11 @@ TYPED_TEST(Jacobi, ScalarJacobiHandleZero) gko::initialize({{0, 0, 0}, {0, 2, 0}, {0, 0, 0}}, this->exec)); auto b = gko::initialize({1, 2, 3}, this->exec); auto x = Vec::create(this->exec, gko::dim<2>(3, 1)); + auto jacobi = this->scalar_j_factory->generate(mtx); // Jacobi uses 1 as the result when diagonal value is zero. 
jacobi->apply(b, x); - ASSERT_EQ(x->at(0, 0), value_type{1.0}); ASSERT_EQ(x->at(1, 0), value_type{1.0}); ASSERT_EQ(x->at(2, 0), value_type{3.0}); diff --git a/test/preconditioner/jacobi_kernels.cpp b/test/preconditioner/jacobi_kernels.cpp index bc50519f7d7..8bfd8ace57d 100644 --- a/test/preconditioner/jacobi_kernels.cpp +++ b/test/preconditioner/jacobi_kernels.cpp @@ -895,10 +895,11 @@ TEST_F(Jacobi, ScalarJacobiHandleZero) gko::initialize({{0, 0, 0}, {0, 2, 0}, {0, 0, 0}}, ref)); auto b = gko::initialize({1, 2, 3}, ref); auto x = Vec::create(ref, gko::dim<2>(3, 1)); - auto jacobi = Bj::build().with_max_block_size(1u).on(ref)->generate(mtx); auto d_b = b->clone(exec); auto d_x = x->clone(exec); auto d_mtx = gko::share(mtx->clone(exec)); + + auto jacobi = Bj::build().with_max_block_size(1u).on(ref)->generate(mtx); // Must generate from scratch because the clone copies the inverted // information. auto d_jacobi = @@ -907,6 +908,5 @@ TEST_F(Jacobi, ScalarJacobiHandleZero) // Jacobi uses 1 as the result when diagonal value is zero. 
jacobi->apply(b, x); d_jacobi->apply(d_b, d_x); - GKO_ASSERT_MTX_NEAR(d_x, x, 0.0); } From c4e02a16588e90965518a21ff1dfbbc8ce43bf72 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 1 May 2024 15:28:21 +0200 Subject: [PATCH 069/448] require C++17 support --- README.md | 10 +++++----- accessor/utils.hpp | 12 +++++++----- cmake/build_helpers.cmake | 6 +++--- cmake/create_test.cmake | 4 ++-- cmake/hip.cmake | 2 +- common/cuda_hip/components/reduction.hpp | 2 +- common/cuda_hip/reorder/rcm_kernels.cpp | 1 - core/base/extended_float.hpp | 1 - core/solver/cb_gmres_accessor.hpp | 1 - core/test/accessor/CMakeLists.txt | 2 +- core/test/base/deferred_factory.cpp | 6 +++--- cuda/base/pointer_mode_guard.hpp | 9 ++++++--- dpcpp/reorder/rcm_kernels.dp.cpp | 1 - examples/custom-matrix-format/CMakeLists.txt | 2 +- hip/base/pointer_mode_guard.hip.hpp | 9 ++++++--- include/ginkgo/core/base/abstract_factory.hpp | 3 ++- include/ginkgo/core/base/combination.hpp | 3 ++- include/ginkgo/core/base/composition.hpp | 3 ++- include/ginkgo/core/base/math.hpp | 17 ++++++++--------- include/ginkgo/core/base/utils_helper.hpp | 5 ++--- include/ginkgo/core/distributed/matrix.hpp | 2 +- include/ginkgo/core/log/logger.hpp | 2 +- include/ginkgo/core/preconditioner/ic.hpp | 1 - include/ginkgo/core/preconditioner/ilu.hpp | 1 - include/ginkgo/core/solver/solver_traits.hpp | 12 +++++++----- test/test_install/CMakeLists.txt | 4 ++-- 26 files changed, 63 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 0328ac43415..d5e22bd0b35 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Ginkgo is a high-performance numerical linear algebra library for many-core systems, with a focus on solution of sparse linear systems. It is implemented using modern C++ -(you will need an at least C++14 compliant compiler to build it), with GPU kernels +(you will need an at least C++17 compliant compiler to build it), with GPU kernels implemented for NVIDIA, AMD and Intel GPUs. 
--- @@ -39,7 +39,7 @@ implemented for NVIDIA, AMD and Intel GPUs. For Ginkgo core library: * _cmake 3.16+_ -* C++14 compliant compiler, one of: +* C++17 compliant compiler, one of: * _gcc 5.5+_ * _clang 3.9+_ * _Intel compiler 2019+_ @@ -50,7 +50,7 @@ For Ginkgo core library: The Ginkgo CUDA module has the following __additional__ requirements: * _cmake 3.18+_ (If CUDA was installed through the NVIDIA HPC Toolkit, we require _cmake 3.22+_) -* _CUDA 10.1+_ or _NVHPC Package 22.7+_ +* _CUDA 11.0+_ or _NVHPC Package 22.7+_ * Any host compiler restrictions your version of CUDA may impose also apply here. For the newest CUDA version, this information can be found in the [CUDA installation guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) @@ -88,13 +88,13 @@ following: ### Windows * _cmake 3.16+_ -* C++14 compliant 64-bit compiler: +* C++17 compliant 64-bit compiler: * _MinGW : gcc 5.5+_ * _Microsoft Visual Studio : VS 2019+_ The Ginkgo CUDA module has the following __additional__ requirements: -* _CUDA 10.1+_ +* _CUDA 11.0+_ * _Microsoft Visual Studio_ * Any host compiler restrictions your version of CUDA may impose also apply here. 
For the newest CUDA version, this information can be found in the diff --git a/accessor/utils.hpp b/accessor/utils.hpp index 5dcd3e89fcd..14d1e26492a 100644 --- a/accessor/utils.hpp +++ b/accessor/utils.hpp @@ -5,9 +5,11 @@ #ifndef GKO_ACCESSOR_UTILS_HPP_ #define GKO_ACCESSOR_UTILS_HPP_ + #include #include #include +#include #if defined(__CUDACC__) || defined(__HIPCC__) @@ -144,7 +146,7 @@ namespace detail { * @internal * Tests if a member function `Ref::to_arithmetic_type` exists */ -template > +template > struct has_to_arithmetic_type : std::false_type { static_assert(std::is_same::value, "Do not modify the Dummy value!"); @@ -153,7 +155,7 @@ struct has_to_arithmetic_type : std::false_type { template struct has_to_arithmetic_type< - Ref, xstd::void_t().to_arithmetic_type())>> + Ref, std::void_t().to_arithmetic_type())>> : std::true_type { using type = decltype(std::declval().to_arithmetic_type()); }; @@ -163,14 +165,14 @@ struct has_to_arithmetic_type< * @internal * Tests if the type `Ref::arithmetic_type` exists */ -template > +template > struct has_arithmetic_type : std::false_type { static_assert(std::is_same::value, "Do not modify the Dummy value!"); }; template -struct has_arithmetic_type> +struct has_arithmetic_type> : std::true_type {}; @@ -236,7 +238,7 @@ struct has_implicit_cast { template struct has_implicit_cast( + std::void_t( std::declval()))>> { static constexpr bool value = true; }; diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake index e8691b77587..f0337839a55 100644 --- a/cmake/build_helpers.cmake +++ b/cmake/build_helpers.cmake @@ -18,11 +18,11 @@ function(ginkgo_default_includes name) endfunction() function(ginkgo_compile_features name) - target_compile_features("${name}" PUBLIC cxx_std_14) + target_compile_features("${name}" PUBLIC cxx_std_17) # we set these properties regardless of the enabled backends, # because unknown properties are ignored - set_target_properties("${name}" PROPERTIES HIP_STANDARD 14) - 
set_target_properties("${name}" PROPERTIES CUDA_STANDARD 14) + set_target_properties("${name}" PROPERTIES HIP_STANDARD 17) + set_target_properties("${name}" PROPERTIES CUDA_STANDARD 17) if(GINKGO_WITH_CLANG_TIDY AND GINKGO_CLANG_TIDY_PATH) set_property(TARGET "${name}" PROPERTY CXX_CLANG_TIDY "${GINKGO_CLANG_TIDY_PATH};-checks=*") endif() diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 9f7079f60a3..68f5708e829 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -37,8 +37,8 @@ function(ginkgo_set_test_target_properties test_target_name test_library_suffix) target_compile_features(${test_target_name} PUBLIC cxx_std_14) # we set these properties regardless of the enabled backends, # because unknown properties are ignored - set_target_properties(${test_target_name} PROPERTIES HIP_STANDARD 14) - set_target_properties(${test_target_name} PROPERTIES CUDA_STANDARD 14) + set_target_properties(${test_target_name} PROPERTIES HIP_STANDARD 17) + set_target_properties(${test_target_name} PROPERTIES CUDA_STANDARD 17) target_include_directories(${test_target_name} PRIVATE ${Ginkgo_BINARY_DIR} ${set_properties_ADDITIONAL_INCLUDES}) target_link_libraries(${test_target_name} PRIVATE ginkgo GTest::GTest ${set_properties_ADDITIONAL_LIBRARIES}) endfunction() diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 327375bfe76..c94117242eb 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -128,4 +128,4 @@ find_package(ROCTX) if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND GINKGO_HIP_VERSION VERSION_GREATER_EQUAL 5) set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -munsafe-fp-atomics -Wno-unused-command-line-argument") endif() -set(CMAKE_HIP_STANDARD 14) +set(CMAKE_HIP_STANDARD 17) diff --git a/common/cuda_hip/components/reduction.hpp b/common/cuda_hip/components/reduction.hpp index 1968a6d30b6..7c66befa6bd 100644 --- a/common/cuda_hip/components/reduction.hpp +++ b/common/cuda_hip/components/reduction.hpp @@ -141,7 +141,7 @@ __device__ void reduce(const Group& 
__restrict__ group, */ template < typename Group, typename ValueType, typename Operator, - typename = xstd::enable_if_t::value>> + typename = std::enable_if_t::value>> __device__ void multireduce(const Group& __restrict__ group, ValueType* __restrict__ data, size_type stride, size_type num, Operator reduce_op = Operator{}) diff --git a/common/cuda_hip/reorder/rcm_kernels.cpp b/common/cuda_hip/reorder/rcm_kernels.cpp index 3206fb28c8b..72729db30f1 100644 --- a/common/cuda_hip/reorder/rcm_kernels.cpp +++ b/common/cuda_hip/reorder/rcm_kernels.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include #include diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index 2dc60afd329..c14b5d1bd39 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -9,7 +9,6 @@ #include #include -#include #include diff --git a/core/solver/cb_gmres_accessor.hpp b/core/solver/cb_gmres_accessor.hpp index 64a7c9a46e5..a5d95793d15 100644 --- a/core/solver/cb_gmres_accessor.hpp +++ b/core/solver/cb_gmres_accessor.hpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include "accessor/range.hpp" diff --git a/core/test/accessor/CMakeLists.txt b/core/test/accessor/CMakeLists.txt index 4fd0ff158d0..07da99cc308 100644 --- a/core/test/accessor/CMakeLists.txt +++ b/core/test/accessor/CMakeLists.txt @@ -6,7 +6,7 @@ function(create_accessor_test test_name) ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") add_executable("${TEST_TARGET_NAME}" "${test_name}.cpp") - target_compile_features("${TEST_TARGET_NAME}" PUBLIC cxx_std_14) + target_compile_features("${TEST_TARGET_NAME}" PUBLIC cxx_std_17) target_include_directories("${TEST_TARGET_NAME}" PRIVATE "${Ginkgo_SOURCE_DIR}" diff --git a/core/test/base/deferred_factory.cpp b/core/test/base/deferred_factory.cpp index a1c02103cf8..4b140bfcbc6 100644 --- a/core/test/base/deferred_factory.cpp +++ 
b/core/test/base/deferred_factory.cpp @@ -80,12 +80,12 @@ struct test_impl : std::false_type {}; // specialization for constructor template -struct test_impl()...))>, T, - Args...> : std::true_type {}; +struct test_impl()...))>, T, Args...> + : std::true_type {}; // specialization for DF2 with_factory_list template -struct test_impl()...))>, DummyFlag, Args...> : std::true_type {}; diff --git a/cuda/base/pointer_mode_guard.hpp b/cuda/base/pointer_mode_guard.hpp index 39af6100c46..56f46fedf40 100644 --- a/cuda/base/pointer_mode_guard.hpp +++ b/cuda/base/pointer_mode_guard.hpp @@ -13,7 +13,6 @@ #include #include -#include namespace gko { @@ -35,6 +34,7 @@ class pointer_mode_guard { pointer_mode_guard(cublasHandle_t& handle) { l_handle = &handle; + uncaught_exceptions_ = std::uncaught_exceptions(); GKO_ASSERT_NO_CUBLAS_ERRORS( cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); } @@ -50,7 +50,7 @@ class pointer_mode_guard { ~pointer_mode_guard() noexcept(false) { /* Ignore the error during stack unwinding for this call */ - if (xstd::uncaught_exception()) { + if (std::uncaught_exceptions() > uncaught_exceptions_) { cublasSetPointerMode(*l_handle, CUBLAS_POINTER_MODE_DEVICE); } else { GKO_ASSERT_NO_CUBLAS_ERRORS( @@ -59,6 +59,7 @@ class pointer_mode_guard { } private: + int uncaught_exceptions_; cublasHandle_t* l_handle; }; @@ -82,6 +83,7 @@ class pointer_mode_guard { pointer_mode_guard(cusparseHandle_t handle) { l_handle = handle; + uncaught_exceptions_ = std::uncaught_exceptions(); GKO_ASSERT_NO_CUSPARSE_ERRORS( cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST)); } @@ -97,7 +99,7 @@ class pointer_mode_guard { ~pointer_mode_guard() noexcept(false) { /* Ignore the error during stack unwinding for this call */ - if (xstd::uncaught_exception()) { + if (std::uncaught_exceptions() > uncaught_exceptions_) { cusparseSetPointerMode(l_handle, CUSPARSE_POINTER_MODE_DEVICE); } else { GKO_ASSERT_NO_CUSPARSE_ERRORS( @@ -106,6 +108,7 @@ class pointer_mode_guard { }
private: + int uncaught_exceptions_; cusparseHandle_t l_handle; }; diff --git a/dpcpp/reorder/rcm_kernels.dp.cpp b/dpcpp/reorder/rcm_kernels.dp.cpp index 350b4c90a6d..2985f1a0dc7 100644 --- a/dpcpp/reorder/rcm_kernels.dp.cpp +++ b/dpcpp/reorder/rcm_kernels.dp.cpp @@ -7,7 +7,6 @@ #include #include -#include #include #include #include diff --git a/examples/custom-matrix-format/CMakeLists.txt b/examples/custom-matrix-format/CMakeLists.txt index ad6b9c2950f..a9ad895a996 100644 --- a/examples/custom-matrix-format/CMakeLists.txt +++ b/examples/custom-matrix-format/CMakeLists.txt @@ -12,7 +12,7 @@ if(NOT (GINKGO_BUILD_CUDA AND GINKGO_BUILD_OMP)) "This example needs Ginkgo built with CUDA and OpenMP support") endif() -set(CMAKE_CUDA_STANDARD 14) +set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) add_executable(custom-matrix-format custom-matrix-format.cpp stencil_kernel.cu) diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp index d14c8468c0b..ea03758e087 100644 --- a/hip/base/pointer_mode_guard.hip.hpp +++ b/hip/base/pointer_mode_guard.hip.hpp @@ -20,7 +20,6 @@ #include #include -#include #include "common/cuda_hip/base/runtime.hpp" @@ -44,6 +43,7 @@ class pointer_mode_guard { pointer_mode_guard(hipblasContext* handle) { l_handle = handle; + uncaught_exceptions_ = std::uncaught_exceptions(); GKO_ASSERT_NO_HIPBLAS_ERRORS( hipblasSetPointerMode(reinterpret_cast(handle), HIPBLAS_POINTER_MODE_HOST)); @@ -60,7 +60,7 @@ class pointer_mode_guard { ~pointer_mode_guard() noexcept(false) { /* Ignore the error during stack unwinding for this call */ - if (xstd::uncaught_exception()) { + if (std::uncaught_exceptions() > uncaught_exceptions_) { hipblasSetPointerMode(reinterpret_cast(l_handle), HIPBLAS_POINTER_MODE_DEVICE); } else { @@ -71,6 +71,7 @@ class pointer_mode_guard { } private: + int uncaught_exceptions_; hipblasContext* l_handle; }; @@ -94,6 +95,7 @@ class pointer_mode_guard { pointer_mode_guard(hipsparseContext* handle) { 
l_handle = handle; + uncaught_exceptions_ = std::uncaught_exceptions(); GKO_ASSERT_NO_HIPSPARSE_ERRORS( hipsparseSetPointerMode(reinterpret_cast(handle), HIPSPARSE_POINTER_MODE_HOST)); @@ -110,7 +112,7 @@ class pointer_mode_guard { ~pointer_mode_guard() noexcept(false) { /* Ignore the error during stack unwinding for this call */ - if (xstd::uncaught_exception()) { + if (std::uncaught_exceptions() > uncaught_exceptions_) { hipsparseSetPointerMode( reinterpret_cast(l_handle), HIPSPARSE_POINTER_MODE_DEVICE); @@ -122,6 +124,7 @@ class pointer_mode_guard { } private: + int uncaught_exceptions_; hipsparseContext* l_handle; }; diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp index 5c799ab58f1..cbd18cf42d6 100644 --- a/include/ginkgo/core/base/abstract_factory.hpp +++ b/include/ginkgo/core/base/abstract_factory.hpp @@ -6,6 +6,7 @@ #define GKO_PUBLIC_CORE_BASE_ABSTRACT_FACTORY_HPP_ +#include #include #include @@ -563,7 +564,7 @@ private: \ \ public: \ template >...>::value>> \ auto with_##_name(Args&&... 
factories) \ diff --git a/include/ginkgo/core/base/combination.hpp b/include/ginkgo/core/base/combination.hpp index f3cdea82dcb..a9fb4d565ae 100644 --- a/include/ginkgo/core/base/combination.hpp +++ b/include/ginkgo/core/base/combination.hpp @@ -6,6 +6,7 @@ #define GKO_PUBLIC_CORE_BASE_COMBINATION_HPP_ +#include #include #include @@ -136,7 +137,7 @@ class Combination : public EnableLinOp>, */ template < typename CoefficientIterator, typename OperatorIterator, - typename = xstd::void_t< + typename = std::void_t< typename std::iterator_traits< CoefficientIterator>::iterator_category, typename std::iterator_traits::iterator_category>> diff --git a/include/ginkgo/core/base/composition.hpp b/include/ginkgo/core/base/composition.hpp index e151e121b56..9c16f8720aa 100644 --- a/include/ginkgo/core/base/composition.hpp +++ b/include/ginkgo/core/base/composition.hpp @@ -6,6 +6,7 @@ #define GKO_PUBLIC_CORE_BASE_COMPOSITION_HPP_ +#include #include #include @@ -125,7 +126,7 @@ class Composition : public EnableLinOp>, * @param end iterator pointing behind the last operator */ template ::iterator_category>> explicit Composition(Iterator begin, Iterator end) : EnableLinOp([&] { diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 42eff5a5d40..f7b3b35c3f6 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -877,7 +877,7 @@ namespace detail { * @note * This basically mirrors the accessor functionality */ -template > +template > struct has_to_arithmetic_type : std::false_type { static_assert(std::is_same::value, "Do not modify the Dummy value!"); @@ -886,7 +886,7 @@ struct has_to_arithmetic_type : std::false_type { template struct has_to_arithmetic_type< - Ref, xstd::void_t().to_arithmetic_type())>> + Ref, std::void_t().to_arithmetic_type())>> : std::true_type { using type = decltype(std::declval().to_arithmetic_type()); }; @@ -896,14 +896,14 @@ struct has_to_arithmetic_type< * @internal * Tests if the type 
`Ref::arithmetic_type` exists */ -template > +template > struct has_arithmetic_type : std::false_type { static_assert(std::is_same::value, "Do not modify the Dummy value!"); }; template -struct has_arithmetic_type> +struct has_arithmetic_type> : std::true_type {}; @@ -1070,17 +1070,16 @@ GKO_INLINE GKO_ATTRIBUTES constexpr auto squared_norm(const T& x) * @return x >= zero() ? x : -x; */ template -GKO_INLINE - GKO_ATTRIBUTES constexpr xstd::enable_if_t::value, T> - abs(const T& x) +GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t::value, T> +abs(const T& x) { return x >= zero() ? x : -x; } template -GKO_INLINE GKO_ATTRIBUTES constexpr xstd::enable_if_t::value, - remove_complex> +GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t::value, + remove_complex> abs(const T& x) { return sqrt(squared_norm(x)); diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp index faa74974703..3ea5c9d878d 100644 --- a/include/ginkgo/core/base/utils_helper.hpp +++ b/include/ginkgo/core/base/utils_helper.hpp @@ -12,7 +12,6 @@ #include #include -#include #include @@ -99,7 +98,7 @@ template struct is_clonable_impl : std::false_type {}; template -struct is_clonable_impl().clone())>> +struct is_clonable_impl().clone())>> : std::true_type {}; template @@ -114,7 +113,7 @@ struct is_clonable_to_impl : std::false_type {}; template struct is_clonable_to_impl< - T, xstd::void_t().clone( + T, std::void_t().clone( std::declval>()))>> : std::true_type {}; diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 4689c3d3381..9e3d45443b1 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -55,7 +55,7 @@ struct is_matrix_type_builder : std::false_type {}; template struct is_matrix_type_builder< Builder, ValueType, IndexType, - gko::xstd::void_t< + std::void_t< decltype(std::declval().template create( std::declval>()))>> : std::true_type {}; diff 
--git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index 907bc418906..dd9d30249e9 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -796,7 +796,7 @@ class EnableLogging : public PolymorphicBase { template struct propagate_log_helper< Event, ConcreteLoggableT, - xstd::void_t< + std::void_t< decltype(std::declval().get_executor())>> { template static void propagate_log(const ConcreteLoggableT* loggable, diff --git a/include/ginkgo/core/preconditioner/ic.hpp b/include/ginkgo/core/preconditioner/ic.hpp index f78e00eea09..aea43af3cf1 100644 --- a/include/ginkgo/core/preconditioner/ic.hpp +++ b/include/ginkgo/core/preconditioner/ic.hpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp index 869681fc547..1f4be3e3046 100644 --- a/include/ginkgo/core/preconditioner/ilu.hpp +++ b/include/ginkgo/core/preconditioner/ilu.hpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/include/ginkgo/core/solver/solver_traits.hpp b/include/ginkgo/core/solver/solver_traits.hpp index b5c437716b3..d5306f56b08 100644 --- a/include/ginkgo/core/solver/solver_traits.hpp +++ b/include/ginkgo/core/solver/solver_traits.hpp @@ -6,7 +6,9 @@ #define GKO_PUBLIC_CORE_SOLVER_SOLVER_TRAITS_HPP_ -#include +#include + + #include @@ -33,12 +35,12 @@ struct has_with_criteria : std::false_type {}; * * @internal The second template parameter (which uses SFINAE) must match * the default value of the general case in order to be accepted - * as a specialization, which is why `xstd::void_t` is used. + * as a specialization, which is why `std::void_t` is used. 
*/ template -struct has_with_criteria< - SolverType, xstd::void_t()))>> +struct has_with_criteria()))>> : std::true_type {}; diff --git a/test/test_install/CMakeLists.txt b/test/test_install/CMakeLists.txt index ee19b8d030e..60fad7cf339 100644 --- a/test/test_install/CMakeLists.txt +++ b/test/test_install/CMakeLists.txt @@ -38,7 +38,7 @@ if(GINKGO_BUILD_CUDA) enable_language(CUDA) configure_file(test_install.cpp test_install.cu COPYONLY) add_executable(test_install_cuda ${CMAKE_CURRENT_BINARY_DIR}/test_install.cu) - set_target_properties(test_install_cuda PROPERTIES CUDA_STANDARD 14) + set_target_properties(test_install_cuda PROPERTIES CUDA_STANDARD 17) target_compile_definitions(test_install_cuda PRIVATE HAS_CUDA=1) target_compile_definitions(test_install_cuda PRIVATE HAS_REFERENCE=${HAS_REFERENCE}) target_link_libraries(test_install_cuda PRIVATE Ginkgo::ginkgo) @@ -49,7 +49,7 @@ if(GINKGO_BUILD_HIP) configure_file(test_install.cpp test_install.hip.cpp COPYONLY) set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/test_install.hip.cpp PROPERTIES LANGUAGE HIP) add_executable(test_install_hip ${CMAKE_CURRENT_BINARY_DIR}/test_install.hip.cpp) - set_target_properties(test_install_hip PROPERTIES HIP_STANDARD 14) + set_target_properties(test_install_hip PROPERTIES HIP_STANDARD 17) target_link_libraries(test_install_hip PRIVATE Ginkgo::ginkgo) target_compile_definitions(test_install_hip PRIVATE HAS_HIP=1) From e65798d116701a5815851dda20b990e41b74d75c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 1 May 2024 15:31:40 +0200 Subject: [PATCH 070/448] adress TODOs --- core/log/profiler_hook.hpp | 8 +++----- include/ginkgo/core/log/profiler_hook.hpp | 1 - 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/core/log/profiler_hook.hpp b/core/log/profiler_hook.hpp index 3f4baf80db1..e6104bba932 100644 --- a/core/log/profiler_hook.hpp +++ b/core/log/profiler_hook.hpp @@ -135,14 +135,12 @@ class profiling_scope_guard { profiling_scope_guard(const char* name) 
{ auto functions = log::create_vtune_fns(); - guard_ = std::make_unique( - name, log::profile_event_category::internal, - std::move(functions.first), std::move(functions.second)); + guard_.emplace(name, log::profile_event_category::internal, + std::move(functions.first), std::move(functions.second)); } private: - // TODO17: use std::optional - std::unique_ptr guard_; + std::optional guard_; }; diff --git a/include/ginkgo/core/log/profiler_hook.hpp b/include/ginkgo/core/log/profiler_hook.hpp index ce5e8831f1c..5db0e1275f5 100644 --- a/include/ginkgo/core/log/profiler_hook.hpp +++ b/include/ginkgo/core/log/profiler_hook.hpp @@ -419,7 +419,6 @@ class profiling_scope_guard { profiling_scope_guard(const profiling_scope_guard&) = delete; - // TODO17: unnecessary with guaranteed RVO /** Move-constructs from another scope guard, other will be left empty. */ profiling_scope_guard(profiling_scope_guard&& other); From aa15b066b12a2c99511a1538fcc735db0f788dee Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 1 May 2024 16:14:40 +0200 Subject: [PATCH 071/448] add missing include --- core/log/profiler_hook.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/log/profiler_hook.hpp b/core/log/profiler_hook.hpp index e6104bba932..31d1d1b5a83 100644 --- a/core/log/profiler_hook.hpp +++ b/core/log/profiler_hook.hpp @@ -9,6 +9,9 @@ #include +#include + + namespace gko { namespace log { From a5b2a07050cc5305524aa10c001e9fc58a2fe41a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 1 May 2024 16:27:34 +0200 Subject: [PATCH 072/448] set C++ version where necessary, remove checks where unnecessary ginkgo has the PUBLIC property cxx_std_17, so we don't need to set it in tests. 
pkg-config doesn't propagate C++ standards --- benchmark/CMakeLists.txt | 1 - cmake/create_test.cmake | 3 --- dpcpp/CMakeLists.txt | 1 - test/test_exportbuild/CMakeLists.txt | 1 - test/test_install/CMakeLists.txt | 1 - test/test_pkgconfig/CMakeLists.txt | 2 +- test/test_subdir/CMakeLists.txt | 1 - 7 files changed, 1 insertion(+), 9 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 306655d2315..de6e74d464c 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -37,7 +37,6 @@ function(ginkgo_benchmark_onemkl_linops type def) add_library(onemkl_linops_${type} utils/dpcpp_linops.dp.cpp) # make the dependency public to catch issues target_compile_definitions(onemkl_linops_${type} PUBLIC ${def}) - target_compile_features(onemkl_linops_${type} PRIVATE cxx_std_17) target_link_libraries(onemkl_linops_${type} PRIVATE Ginkgo::ginkgo MKL::MKL_DPCPP) endfunction() diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 68f5708e829..9ab0c40de20 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -34,7 +34,6 @@ function(ginkgo_set_test_target_properties test_target_name test_library_suffix) target_link_libraries(${test_target_name} PRIVATE ginkgo_gtest_main${test_library_suffix}) endif() endif() - target_compile_features(${test_target_name} PUBLIC cxx_std_14) # we set these properties regardless of the enabled backends, # because unknown properties are ignored set_target_properties(${test_target_name} PROPERTIES HIP_STANDARD 17) @@ -139,7 +138,6 @@ endfunction(ginkgo_create_test) function(ginkgo_create_dpcpp_test test_name) ginkgo_build_test_name(${test_name} test_target_name) add_executable(${test_target_name} ${test_name}.dp.cpp) - target_compile_features(${test_target_name} PUBLIC cxx_std_17) target_compile_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS}) gko_add_sycl_to_target(TARGET ${test_target_name} SOURCES ${test_name}.dp.cpp) target_link_options(${test_target_name} PRIVATE 
-fsycl-device-code-split=per_kernel) @@ -270,7 +268,6 @@ function(ginkgo_create_common_device_test test_name) ginkgo_build_test_name(${test_name} test_target_name) if(GINKGO_BUILD_SYCL) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) - target_compile_features(${test_target_name}_dpcpp PRIVATE cxx_std_17) target_compile_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS}) # We need to use a new file to avoid sycl setting in other backends because add_sycl_to_target will change the source property. configure_file(${test_name}.cpp ${test_name}.dp.cpp COPYONLY) diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index ee373243842..851ef9a3dc6 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -104,7 +104,6 @@ gko_add_sycl_to_target(TARGET ginkgo_dpcpp) # Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating # find_package(MKL) everywhere when linking ginkgo (see the MKL example # https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html) -target_compile_features(ginkgo_dpcpp PUBLIC cxx_std_17) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) # When building ginkgo as a static library, we need to use dpcpp and per_kernel # link option when the program uses a dpcpp related function. diff --git a/test/test_exportbuild/CMakeLists.txt b/test/test_exportbuild/CMakeLists.txt index c65f964d9a0..cb8f0b1225c 100644 --- a/test/test_exportbuild/CMakeLists.txt +++ b/test/test_exportbuild/CMakeLists.txt @@ -7,5 +7,4 @@ find_package(Ginkgo REQUIRED # Here, we use test install without any data. We instantiate the # interface only. 
add_executable(test_exportbuild ../test_install/test_install.cpp) -target_compile_features(test_exportbuild PUBLIC cxx_std_14) target_link_libraries(test_exportbuild PRIVATE Ginkgo::ginkgo) diff --git a/test/test_install/CMakeLists.txt b/test/test_install/CMakeLists.txt index 60fad7cf339..285c21f271b 100644 --- a/test/test_install/CMakeLists.txt +++ b/test/test_install/CMakeLists.txt @@ -26,7 +26,6 @@ if (GINKGO_BUILD_REFERENCE) set(HAS_REFERENCE 1) endif() add_executable(test_install test_install.cpp) -target_compile_features(test_install PUBLIC cxx_std_14) target_compile_definitions(test_install PRIVATE HAS_REFERENCE=${HAS_REFERENCE}) target_link_libraries(test_install PRIVATE Ginkgo::ginkgo) if(GINKGO_BUILD_MPI) diff --git a/test/test_pkgconfig/CMakeLists.txt b/test/test_pkgconfig/CMakeLists.txt index e904f997f26..12b9fc4dc26 100644 --- a/test/test_pkgconfig/CMakeLists.txt +++ b/test/test_pkgconfig/CMakeLists.txt @@ -8,7 +8,7 @@ pkg_check_modules(GINKGO REQUIRED IMPORTED_TARGET ginkgo) # Here, we use test install without any data. We instantiate the # interface only. add_executable(test_pkgconfig ../test_install/test_install.cpp) -target_compile_features(test_pkgconfig PUBLIC cxx_std_14) +target_compile_features(test_pkgconfig PUBLIC cxx_std_17) # CMake PkgConfig only puts the -l, -L, and -framework into link_libraries and others into link_options # When linking the target, the linking option will be before the compiled object to lead the linking error set_property(TARGET PkgConfig::GINKGO PROPERTY INTERFACE_LINK_LIBRARIES "${GINKGO_LDFLAGS}") diff --git a/test/test_subdir/CMakeLists.txt b/test/test_subdir/CMakeLists.txt index dcf846f4adc..00ae3bc07e2 100644 --- a/test/test_subdir/CMakeLists.txt +++ b/test/test_subdir/CMakeLists.txt @@ -5,5 +5,4 @@ file(CREATE_LINK "${CMAKE_CURRENT_SOURCE_DIR}/../.." 
"${CMAKE_CURRENT_BINARY_DIR add_subdirectory("${CMAKE_CURRENT_BINARY_DIR}/ginkgo") add_executable(test_subdir ../test_install/test_install.cpp) -target_compile_features(test_subdir PUBLIC cxx_std_14) target_link_libraries(test_subdir PRIVATE Ginkgo::ginkgo) From 8932e1287e818a864998800a8f07b0728fd0de42 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 1 May 2024 17:23:14 +0200 Subject: [PATCH 073/448] remove unsupported compilers --- .gitlab-ci.yml | 160 ++++------------------------------------------ .gitlab/image.yml | 27 -------- 2 files changed, 14 insertions(+), 173 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1866f16406a..88748c95b79 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -93,138 +93,6 @@ trigger_pipeline: # Build jobs # Job with example runs. -# cuda 10.1 and friends -# Build CUDA NVIDIA without omp -# Make sure that our jobs run when HWLOC is -# forcibly switched off -build/cuda101/nompi/clang/cuda_wo_omp/release/shared: - extends: - - .build_template - - .default_variables - - .full_test_condition - - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019 - variables: - CXX_COMPILER: "clang++" - BUILD_CUDA: "ON" - BUILD_HWLOC: "OFF" - BUILD_TYPE: "Release" - CUDA_ARCH: 35 - -# Job with example runs. 
-# Also explicitly test PAPI SDE -build/cuda101/openmpi/gcc/all/debug/shared: - extends: - - .build_template - - .default_variables - - .quick_test_condition - - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019 - variables: - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_MPI: "ON" - MPI_AS_ROOT: "ON" - BUILD_TYPE: "Debug" - BUILD_PAPI_SDE: "ON" - CUDA_ARCH: 35 - -build/cuda101/nompi/clang/all/release/static: - extends: - - .build_template - - .default_variables - - .full_test_condition - - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019 - variables: - CXX_COMPILER: "clang++" - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_TYPE: "Release" - BUILD_SHARED_LIBS: "OFF" - CUDA_ARCH: 35 - -# clang-cuda with cuda 10.1 and friends -#build/clang-cuda101/openmpi/clang/cuda/release/shared: -# extends: -# - .build_and_test_template -# - .default_variables -# - .quick_test_condition -# - .use_gko-cuda101-openmpi-gnu8-llvm13-intel2019 -# variables: -# CXX_COMPILER: "clang++" -# CUDA_COMPILER: "clang++" -# BUILD_OMP: "ON" -# BUILD_CUDA: "ON" -# BUILD_MPI: "ON" -# MPI_AS_ROOT: "ON" -# BUILD_HIP: "OFF" -# BUILD_TYPE: "Release" - - -#build/clang-cuda101/nompi/clang/cuda/debug/static: -# extends: -# - .build_and_test_template -# - .default_variables -# - .full_test_condition -# - .use_gko-cuda101-openmpi-gnu8-llvm13-intel2019 -# variables: -# CXX_COMPILER: "clang++" -# CUDA_COMPILER: "clang++" -# BUILD_OMP: "ON" -# BUILD_CUDA: "ON" -# BUILD_TYPE: "Debug" -# FAST_TESTS: "ON" -# BUILD_SHARED_LIBS: "OFF" - - -# cuda 10.2 and friends - -# works when there is no hwloc and tpl hwloc is also switched off. 
-build/cuda102/nompi/gcc/all/debug/shared: - extends: - - .build_template - - .default_variables - - .full_test_condition - - .use_gko-cuda102-nompi-gnu8-llvm8-intel2019 - variables: - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_TYPE: "Debug" - FAST_TESTS: "ON" - BUILD_HWLOC: "OFF" - CUDA_ARCH: 35 - -# Use TPL hwloc when no system hwloc is available -build/cuda102/nompi/clang/all/release/static: - extends: - - .build_template - - .default_variables - - .full_test_condition - - .use_gko-cuda102-nompi-gnu8-llvm8-intel2019 - variables: - CXX_COMPILER: "clang++" - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_TYPE: "Release" - BUILD_SHARED_LIBS: "OFF" - CUDA_ARCH: 35 - BUILD_HWLOC: "OFF" - -build/cuda102/nompi/intel/cuda/debug/static: - extends: - - .build_template - - .default_variables - - .full_test_condition - - .use_gko-cuda102-nompi-gnu8-llvm8-intel2019 - variables: - CXX_COMPILER: "icpc" - CXX_FLAGS: "" - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_TYPE: "Debug" - FAST_TESTS: "ON" - BUILD_SHARED_LIBS: "OFF" - CUDA_ARCH: 35 - BUILD_HWLOC: "OFF" - # cuda 11.0 and friends on HoreKa with tests build/cuda110/mvapich2/gcc/cuda/debug/shared: extends: @@ -521,13 +389,13 @@ build/nocuda/openmpi/clang/omp/glibcxx-debug-release/shared: # The tests are prohibitively slow in Debug BUILD_TYPE: "Release" -# nocuda with the oldest supported compiler +# nocuda with old compiler build/nocuda/nompi/gcc/omp/release/static: extends: - .build_and_test_template - .default_variables - .quick_test_condition - - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019 + - .use_gko-nocuda-nompi-gnu9-llvm8 variables: BUILD_OMP: "ON" BUILD_TYPE: "Release" @@ -538,7 +406,7 @@ build/nocuda-nomixed/nompi/clang/omp/release/static: - .build_and_test_template - .default_variables - .full_test_condition - - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019 + - .use_gko-nocuda-nompi-gnu9-llvm8 variables: CXX_COMPILER: "clang++" BUILD_OMP: "ON" @@ -668,7 +536,7 @@ warnings: - .build_template - .default_variables - 
.full_test_condition - - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019 + - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 variables: BUILD_OMP: "ON" BUILD_CUDA: "ON" @@ -682,7 +550,7 @@ no-circular-deps: - .build_template - .default_variables - .quick_test_condition - - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019 + - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 variables: BUILD_OMP: "ON" BUILD_CUDA: "ON" @@ -709,7 +577,7 @@ clang-tidy: - .build_template - .default_variables - .full_test_condition - - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019 + - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 variables: BUILD_OMP: "ON" BUILD_CUDA: "ON" @@ -722,7 +590,7 @@ iwyu: - .build_template - .default_variables - .full_test_condition - - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019 + - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 variables: BUILD_OMP: "ON" BUILD_CUDA: "ON" @@ -737,7 +605,7 @@ sonarqube_cov_: - .default_variables - .quick_test_short_lived_condition - .before_script_template - - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019 + - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 tags: - private_ci - controller @@ -773,7 +641,7 @@ sonarqube_cov: - .default_variables - .deploy_condition - .before_script_template - - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019 + - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 tags: - private_ci - controller @@ -836,7 +704,7 @@ threadsanitizer: - .default_variables - .deploy_condition - .before_script_template - - .use_gko-cuda101-openmpi-gnu8-llvm13-intel2019 + - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 script: - LD_PRELOAD=/usr/local/lib/libomp.so CC=clang CXX=clang++ @@ -851,7 +719,7 @@ leaksanitizer: - .default_variables - .deploy_condition - .before_script_template - - .use_gko-cuda101-openmpi-gnu8-llvm13-intel2019 + - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 script: - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=LSAN -DCTEST_MEMORYCHECK_TYPE=LeakSanitizer @@ -862,7 +730,7 @@ 
addresssanitizer: - .default_variables - .deploy_condition - .before_script_template - - .use_gko-cuda101-openmpi-gnu8-llvm13-intel2019 + - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 script: - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=ASAN -DCTEST_MEMORYCHECK_TYPE=AddressSanitizer @@ -873,7 +741,7 @@ undefinedsanitizer: - .default_variables - .deploy_condition - .before_script_template - - .use_gko-cuda101-openmpi-gnu8-llvm13-intel2019 + - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 script: # the Gold linker is required because of a linker flag issues given by UBsan # in the Ubuntu setup we are using. @@ -886,7 +754,7 @@ cudamemcheck: - .before_script_template - .default_variables - .deploy_condition - image: ginkgohub/cuda:101-openmpi-gnu8-llvm13-intel2019 + image: use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 tags: - private_ci - nvidia-gpu diff --git a/.gitlab/image.yml b/.gitlab/image.yml index da548066a86..60521044d7f 100644 --- a/.gitlab/image.yml +++ b/.gitlab/image.yml @@ -17,33 +17,6 @@ - cpu - amdci -.use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019: - image: ginkgohub/cpu:mvapich2-gnu5-llvm39-intel2019 - tags: - - private_ci - - cpu - - controller - -.use_gko-cuda101-openmpi-gnu8-llvm7-intel2019: - image: ginkgohub/cuda:101-openmpi-gnu8-llvm7-intel2019 - tags: - - private_ci - - controller - - cpu - -.use_gko-cuda101-openmpi-gnu8-llvm13-intel2019: - image: ginkgohub/cuda:101-openmpi-gnu8-llvm13-intel2019 - tags: - - private_ci - - nvidia-gpu - -.use_gko-cuda102-nompi-gnu8-llvm8-intel2019: - image: ginkgohub/cuda:102-nompi-gnu8-llvm8-intel2019 - tags: - - private_ci - - controller - - cpu - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020: image: ginkgohub/cuda:110-mvapich2-gnu9-llvm9-intel2020 tags: From 63253cb37b7a6ef5aebe997c76505c1807513cfd Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 4 Jul 2024 14:09:50 +0200 Subject: [PATCH 074/448] use compile features for specifying HIP/CUDA standard version --- 
cmake/build_helpers.cmake | 10 ++++++---- examples/custom-matrix-format/CMakeLists.txt | 3 --- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake index f0337839a55..0985f089382 100644 --- a/cmake/build_helpers.cmake +++ b/cmake/build_helpers.cmake @@ -19,10 +19,12 @@ endfunction() function(ginkgo_compile_features name) target_compile_features("${name}" PUBLIC cxx_std_17) - # we set these properties regardless of the enabled backends, - # because unknown properties are ignored - set_target_properties("${name}" PROPERTIES HIP_STANDARD 17) - set_target_properties("${name}" PROPERTIES CUDA_STANDARD 17) + if (GINKG_BUILD_CUDA) + target_compile_features("${name}" PUBLIC cuda_std_17) + endif() + if (GINKG_BUILD_HIP) + target_compile_features("${name}" PUBLIC hip_std_17) + endif() if(GINKGO_WITH_CLANG_TIDY AND GINKGO_CLANG_TIDY_PATH) set_property(TARGET "${name}" PROPERTY CXX_CLANG_TIDY "${GINKGO_CLANG_TIDY_PATH};-checks=*") endif() diff --git a/examples/custom-matrix-format/CMakeLists.txt b/examples/custom-matrix-format/CMakeLists.txt index a9ad895a996..9a1280ff9f5 100644 --- a/examples/custom-matrix-format/CMakeLists.txt +++ b/examples/custom-matrix-format/CMakeLists.txt @@ -12,9 +12,6 @@ if(NOT (GINKGO_BUILD_CUDA AND GINKGO_BUILD_OMP)) "This example needs Ginkgo built with CUDA and OpenMP support") endif() -set(CMAKE_CUDA_STANDARD 17) -set(CMAKE_CUDA_STANDARD_REQUIRED ON) - add_executable(custom-matrix-format custom-matrix-format.cpp stencil_kernel.cu) target_link_libraries(custom-matrix-format Ginkgo::ginkgo OpenMP::OpenMP_CXX) From 99748647acdb1e3ba893bf1e5ba627a736441f90 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 5 Jul 2024 11:47:01 +0200 Subject: [PATCH 075/448] update version requirements --- README.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index d5e22bd0b35..74fd6a0f57e 100644 --- a/README.md +++ b/README.md @@ -40,10 
+40,10 @@ For Ginkgo core library: * _cmake 3.16+_ * C++17 compliant compiler, one of: - * _gcc 5.5+_ - * _clang 3.9+_ + * _gcc 7+_ + * _clang 5+_ * _Intel compiler 2019+_ - * _Apple Clang 14.0_ is tested. Earlier versions might also work. + * _Apple Clang 15.0_ is tested. Earlier versions might also work. * _Cray Compiler 14.0.1+_ * _NVHPC Compiler 22.7+_ @@ -59,9 +59,7 @@ The Ginkgo CUDA module has the following __additional__ requirements: The Ginkgo HIP module has the following __additional__ requirements: * _ROCm 4.5+_ -* the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either: - * _AMD_ backend (using the `clang` compiler) - * _10.1 <= CUDA < 11_ backend +* the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with the ROCm backend * if the hipFFT package is available, it is used to implement the FFT LinOps. * _cmake 3.21+_ @@ -69,7 +67,6 @@ The Ginkgo DPC++(SYCL) module has the following __additional__ requirements: * _oneAPI 2023.1+_ * Set `dpcpp` or `icpx` as the `CMAKE_CXX_COMPILER` -* `c++17` is used to compile Ginkgo * The following oneAPI packages should be available: * oneMKL * oneDPL @@ -81,7 +78,7 @@ The Ginkgo MPI module has the following __additional__ requirements: In addition, if you want to contribute code to Ginkgo, you will also need the following: -* _clang-format 8.0.0+_ (ships as part of _clang_) +* _clang-format 14_ (downloaded automatically by `pre-commit`) * _clang-tidy_ (optional, when setting the flag `-DGINKGO_WITH_CLANG_TIDY=ON`) * _iwyu_ (Include What You Use, optional, when setting the flag `-DGINKGO_WITH_IWYU=ON`) @@ -89,7 +86,7 @@ following: * _cmake 3.16+_ * C++17 compliant 64-bit compiler: - * _MinGW : gcc 5.5+_ + * _MinGW : gcc 7+_ * _Microsoft Visual Studio : VS 2019+_ The Ginkgo CUDA module has the following __additional__ requirements: From 5bed30de4666b1414214823e49acdd7a929fc304 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 5 Jul 2024 11:51:16 +0200 Subject: 
[PATCH 076/448] remove older std_extensions --- cuda/base/pointer_mode_guard.hpp | 2 +- include/ginkgo/core/base/std_extensions.hpp | 39 ++++----------------- 2 files changed, 8 insertions(+), 33 deletions(-) diff --git a/cuda/base/pointer_mode_guard.hpp b/cuda/base/pointer_mode_guard.hpp index 56f46fedf40..6340b98eb6f 100644 --- a/cuda/base/pointer_mode_guard.hpp +++ b/cuda/base/pointer_mode_guard.hpp @@ -50,7 +50,7 @@ class pointer_mode_guard { ~pointer_mode_guard() noexcept(false) { /* Ignore the error during stack unwinding for this call */ - if (std::uncaught_exception() > uncaught_exceptions_) { + if (std::uncaught_exceptions() > uncaught_exceptions_) { cublasSetPointerMode(*l_handle, CUBLAS_POINTER_MODE_DEVICE); } else { GKO_ASSERT_NO_CUBLAS_ERRORS( diff --git a/include/ginkgo/core/base/std_extensions.hpp b/include/ginkgo/core/base/std_extensions.hpp index 85857873f24..842ad86a23f 100644 --- a/include/ginkgo/core/base/std_extensions.hpp +++ b/include/ginkgo/core/base/std_extensions.hpp @@ -11,6 +11,8 @@ #include #include +#include "ginkgo/core/base/types.hpp" + // This header provides implementations of useful utilities introduced into the // C++ standard after C++14 (e.g. C++17 and C++20). @@ -25,33 +27,12 @@ namespace gko { * @ingroup xstd */ namespace xstd { -namespace detail { - - -template -struct make_void { - using type = void; -}; - - -} // namespace detail - - -// Added in C++17 template -using void_t = typename detail::make_void::type; +using void_t = std::void_t; -// Disable deprecation warnings when using standard > C++14 -inline bool uncaught_exception() noexcept -{ -// MSVC uses _MSVC_LANG as __cplusplus -#if (defined(_MSVC_LANG) && _MSVC_LANG > 201402L) || __cplusplus > 201402L - return std::uncaught_exceptions() > 0; -#else - return std::uncaught_exception(); -#endif -} +GKO_DEPRECATED("use std::uncaught_exceptions") +inline bool uncaught_exception() noexcept { return std::uncaught_exception(); } // Kept for backward compatibility. 
@@ -101,14 +82,8 @@ constexpr bool less_equal(const T&& lhs, const T&& rhs) } -// available in with C++17 -template -struct conjunction : std::true_type {}; -template -struct conjunction : B1 {}; -template -struct conjunction - : std::conditional_t, B1> {}; +template +using conjunction = std::conjunction; } // namespace xstd From de37ab9e8cdb0912ec7171ccbff6b458c6576b0d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 5 Jul 2024 11:53:13 +0200 Subject: [PATCH 077/448] remove unsupported Intel + CUDA build --- .gitlab-ci.yml | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 88748c95b79..687b517bf78 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -159,40 +159,6 @@ test/cuda110/nompi/clang/cuda/release/static: needs: [ "build/cuda110/nompi/clang/cuda/release/static" ] -build/cuda110/nompi/intel/cuda/debug/static: - extends: - - .build_template - - .default_variables - - .full_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 - variables: - CXX_COMPILER: "icpc" - CXX_FLAGS: "" - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_TYPE: "Debug" - FAST_TESTS: "ON" - BUILD_SHARED_LIBS: "OFF" - CUDA_ARCH: 80 - USE_NAME: "cuda110-nompi-intel-${CI_PIPELINE_ID}" - KEEP_CONTAINER: "ON" - USE_SLURM: 0 - -test/cuda110/nompi/intel/cuda/debug/static: - extends: - - .horeka_test_template - - .default_variables - - .full_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 - variables: - USE_NAME: "cuda110-nompi-intel-${CI_PIPELINE_ID}" - SLURM_PARTITION: "accelerated" - SLURM_GRES: "gpu:4" - SLURM_TIME: "02:00:00" - dependencies: null - needs: [ "build/cuda110/nompi/intel/cuda/debug/static" ] - - # cuda 11.4 and friends build/cuda114/nompi/gcc/cuda/debug/shared: extends: From 43f1dcb964d2fd172d75b52904cae6482bf8db72 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 5 Jul 2024 12:22:59 +0200 Subject: [PATCH 078/448] review updates and formatting --- .gitlab-ci.yml | 2 +- 
core/log/profiler_hook.cpp | 7 ------- core/log/profiler_hook.hpp | 5 ++--- include/ginkgo/core/base/std_extensions.hpp | 5 ++++- include/ginkgo/core/log/profiler_hook.hpp | 3 +-- include/ginkgo/core/solver/solver_traits.hpp | 1 - 6 files changed, 8 insertions(+), 15 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 687b517bf78..8fd46cac12f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -720,7 +720,7 @@ cudamemcheck: - .before_script_template - .default_variables - .deploy_condition - image: use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 + image: ginkgohub/cuda:110-mvapich2-gnu9-llvm9-intel2020 tags: - private_ci - nvidia-gpu diff --git a/core/log/profiler_hook.cpp b/core/log/profiler_hook.cpp index 7cb4f807919..e3ed0ad8299 100644 --- a/core/log/profiler_hook.cpp +++ b/core/log/profiler_hook.cpp @@ -431,13 +431,6 @@ profiling_scope_guard::~profiling_scope_guard() } } -profiling_scope_guard::profiling_scope_guard(profiling_scope_guard&& other) - : empty_{std::exchange(other.empty_, true)}, - name_{std::exchange(other.name_, nullptr)}, - category_{other.category_}, - end_{std::move(other.end_)} -{} - } // namespace log } // namespace gko diff --git a/core/log/profiler_hook.hpp b/core/log/profiler_hook.hpp index 31d1d1b5a83..c4e31c76ef3 100644 --- a/core/log/profiler_hook.hpp +++ b/core/log/profiler_hook.hpp @@ -6,11 +6,10 @@ #define GKO_CORE_LOG_PROFILER_HOOK_HPP_ -#include - - #include +#include + namespace gko { namespace log { diff --git a/include/ginkgo/core/base/std_extensions.hpp b/include/ginkgo/core/base/std_extensions.hpp index 842ad86a23f..893b2b0d865 100644 --- a/include/ginkgo/core/base/std_extensions.hpp +++ b/include/ginkgo/core/base/std_extensions.hpp @@ -32,7 +32,10 @@ using void_t = std::void_t; GKO_DEPRECATED("use std::uncaught_exceptions") -inline bool uncaught_exception() noexcept { return std::uncaught_exception(); } +inline bool uncaught_exception() noexcept +{ + return std::uncaught_exceptions() > 0; +} // Kept for backward 
compatibility. diff --git a/include/ginkgo/core/log/profiler_hook.hpp b/include/ginkgo/core/log/profiler_hook.hpp index 5db0e1275f5..c5dc9dcbab6 100644 --- a/include/ginkgo/core/log/profiler_hook.hpp +++ b/include/ginkgo/core/log/profiler_hook.hpp @@ -419,8 +419,7 @@ class profiling_scope_guard { profiling_scope_guard(const profiling_scope_guard&) = delete; - /** Move-constructs from another scope guard, other will be left empty. */ - profiling_scope_guard(profiling_scope_guard&& other); + profiling_scope_guard(profiling_scope_guard&& other) = delete; profiling_scope_guard& operator=(const profiling_scope_guard&) = delete; diff --git a/include/ginkgo/core/solver/solver_traits.hpp b/include/ginkgo/core/solver/solver_traits.hpp index d5306f56b08..6209cad3e90 100644 --- a/include/ginkgo/core/solver/solver_traits.hpp +++ b/include/ginkgo/core/solver/solver_traits.hpp @@ -8,7 +8,6 @@ #include - #include From d7066224c541356e19c703c9f0e07d361814702c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 8 Jul 2024 15:17:07 +0200 Subject: [PATCH 079/448] add build-only job for SM 3.5 --- .gitlab-ci.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8fd46cac12f..055a7988a0c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -175,6 +175,25 @@ build/cuda114/nompi/gcc/cuda/debug/shared: CXX_FLAGS: "-Wno-error=maybe-uninitialized" # disable spurious unused argument warning EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" + + +build/cuda114/nompi/clang/cuda/release/shared: + extends: + - .build_template + - .default_variables + - .quick_test_condition + - .use_gko_cuda114-openmpi-gnu10-llvm12 + variables: + CXX_COMPILER: "clang++" + CUDA_ARCH: 35 + BUILD_OMP: "ON" + BUILD_CUDA: "ON" + BUILD_TYPE: "Release" + FAST_TESTS: "ON" + # fix gtest issue https://github.com/google/googletest/issues/3514 + CXX_FLAGS: "-Wno-error=maybe-uninitialized" + # disable spurious unused argument warning + EXTRA_CMAKE_FLAGS: 
"-DCMAKE_CUDA_FLAGS=-diag-suppress=177" # nvhpc and friends From f148900eab1c0c48ec20522b3b31a4cd16748f69 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 8 Jul 2024 16:48:47 +0200 Subject: [PATCH 080/448] remove warning flags --- .gitlab-ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 055a7988a0c..03e4b5ad4d0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -190,8 +190,6 @@ build/cuda114/nompi/clang/cuda/release/shared: BUILD_CUDA: "ON" BUILD_TYPE: "Release" FAST_TESTS: "ON" - # fix gtest issue https://github.com/google/googletest/issues/3514 - CXX_FLAGS: "-Wno-error=maybe-uninitialized" # disable spurious unused argument warning EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" From b46a653a528965b1064142bb808b3383bde4b66b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 8 Jul 2024 21:28:18 +0200 Subject: [PATCH 081/448] move to a compatible nvcc/clang combination nvcc with clang++ host compiler seems incompatible with libstdc++-10 --- .gitlab-ci.yml | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 03e4b5ad4d0..2f8e3a892a5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -157,39 +157,39 @@ test/cuda110/nompi/clang/cuda/release/static: SLURM_TIME: "01:30:00" dependencies: null needs: [ "build/cuda110/nompi/clang/cuda/release/static" ] + - -# cuda 11.4 and friends -build/cuda114/nompi/gcc/cuda/debug/shared: +build/cuda110/nompi/clang/cuda/release/shared: extends: - - .build_and_test_template + - .build_template - .default_variables - .quick_test_condition - - .use_gko_cuda114-openmpi-gnu10-llvm12 + - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 variables: + CXX_COMPILER: "clang++" + CUDA_ARCH: 52 BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: "Debug" + BUILD_TYPE: "Release" FAST_TESTS: "ON" - # fix gtest issue https://github.com/google/googletest/issues/3514 - CXX_FLAGS: 
"-Wno-error=maybe-uninitialized" # disable spurious unused argument warning EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" - -build/cuda114/nompi/clang/cuda/release/shared: + +# cuda 11.4 and friends +build/cuda114/nompi/gcc/cuda/debug/shared: extends: - - .build_template + - .build_and_test_template - .default_variables - .quick_test_condition - .use_gko_cuda114-openmpi-gnu10-llvm12 variables: - CXX_COMPILER: "clang++" - CUDA_ARCH: 35 BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: "Release" + BUILD_TYPE: "Debug" FAST_TESTS: "ON" + # fix gtest issue https://github.com/google/googletest/issues/3514 + CXX_FLAGS: "-Wno-error=maybe-uninitialized" # disable spurious unused argument warning EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" From 1c039d3694a72db3f549c23e2a5801134bed06fc Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 1 May 2024 15:28:59 +0200 Subject: [PATCH 082/448] add device support to zip_iterator --- core/base/iterator_factory.hpp | 363 +++++++++++++++--- core/test/base/iterator_factory.cpp | 19 +- omp/distributed/partition_helpers_kernels.cpp | 7 +- omp/matrix/csr_kernels.cpp | 5 +- omp/matrix/fbcsr_kernels.cpp | 5 +- omp/multigrid/pgm_kernels.cpp | 3 +- .../distributed/partition_helpers_kernels.cpp | 13 +- reference/matrix/csr_kernels.cpp | 5 +- reference/matrix/fbcsr_kernels.cpp | 5 +- reference/multigrid/pgm_kernels.cpp | 3 +- test/base/CMakeLists.txt | 1 + test/base/iterator_factory.cpp | 69 ++++ 12 files changed, 410 insertions(+), 88 deletions(-) create mode 100644 test/base/iterator_factory.cpp diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp index 3d224836b1a..938d705b04d 100644 --- a/core/base/iterator_factory.hpp +++ b/core/base/iterator_factory.hpp @@ -24,6 +24,234 @@ template class zip_iterator; +template +class zip_iterator_reference; + + +template +class device_tuple; + + +} // namespace detail +} // namespace gko + + +// structured binding specializations for device_tuple, 
zip_iterator_reference +namespace std { + + +template +struct tuple_size> + : integral_constant {}; + + +template +struct tuple_element> { + using type = typename tuple_element>::type; +}; + + +template +struct tuple_size> + : integral_constant {}; + + +template +struct tuple_element> { + using type = typename iterator_traits< + typename tuple_element>::type>::reference; +}; + + +} // namespace std + + +namespace gko { + + +/** std::get reimplementation for device_tuple. */ +template +constexpr typename std::tuple_element>::type& +get(detail::device_tuple& tuple); + + +/** std::get reimplementation for const device_tuple. */ +template +constexpr const typename std::tuple_element>::type& +get(const detail::device_tuple& tuple); + + +namespace detail { + + +/** simplified constexpr std::tuple reimplementation for use in device code. */ +template +class device_tuple { +public: + /** Constructs a device tuple from its elements. */ + constexpr explicit device_tuple(T value, Ts... others) + : value_{value}, other_{others...} + {} + + device_tuple() = default; + + /** + * Copy-assigns a tuple. + * This is necessary to make tuples of references work, which normally cause + * the impliciy copy-assignment operator to be deleted. + */ + constexpr device_tuple& operator=(const device_tuple& other) + { + value_ = other.value_; + other_ = other.other_; + return *this; + } + + /** @return the index-th element in the tuple. */ + template + constexpr typename std::tuple_element::type& get() + { + if constexpr (index == 0) { + return value_; + } else { + return other_.template get(); + } + } + + /** @return the index-th element in the const tuple. 
*/ + template + constexpr const typename std::tuple_element::type& + get() const + { + if constexpr (index == 0) { + return value_; + } else { + return other_.template get(); + } + } + + // comparison operators + constexpr friend bool operator<(const device_tuple& lhs, + const device_tuple& rhs) + { + return lhs.value_ < rhs.value_ || + (lhs.value_ == rhs.value_ && lhs.other_ < rhs.other_); + } + + constexpr friend bool operator>(const device_tuple& lhs, + const device_tuple& rhs) + { + return rhs < lhs; + } + + constexpr friend bool operator>=(const device_tuple& lhs, + const device_tuple& rhs) + { + return !(lhs < rhs); + } + + constexpr friend bool operator<=(const device_tuple& lhs, + const device_tuple& rhs) + { + return !(lhs > rhs); + } + + constexpr friend bool operator==(const device_tuple& lhs, + const device_tuple& rhs) + { + return lhs.value_ == rhs.value_ && lhs.other_ == rhs.other_; + } + + constexpr friend bool operator!=(const device_tuple& lhs, + const device_tuple& rhs) + { + return !(lhs == rhs); + } + +private: + T value_; + device_tuple other_; +}; + + +template +class device_tuple { +public: + /** Constructs a device tuple from its elements. */ + constexpr explicit device_tuple(T value) : value_{value} {} + + device_tuple() = default; + + /** + * Copy-assigns a tuple. + * This is necessary to make tuples of references work, which normally cause + * the impliciy copy-assignment operator to be deleted. + */ + constexpr device_tuple& operator=(const device_tuple& other) + { + value_ = other.value_; + return *this; + } + + /** @return the index-th element in the tuple. */ + template + constexpr T& get() + { + static_assert(index == 0, "invalid index"); + return value_; + } + + /** @return the index-th element in the const tuple. 
*/ + template + constexpr const T& get() const + { + static_assert(index == 0, "invalid index"); + return value_; + } + + // comparison operators + constexpr friend bool operator<(const device_tuple& lhs, + const device_tuple& rhs) + { + return lhs.value_ < rhs.value_; + } + + constexpr friend bool operator>(const device_tuple& lhs, + const device_tuple& rhs) + { + return rhs < lhs; + } + + constexpr friend bool operator>=(const device_tuple& lhs, + const device_tuple& rhs) + { + return !(lhs < rhs); + } + + constexpr friend bool operator<=(const device_tuple& lhs, + const device_tuple& rhs) + { + return !(lhs > rhs); + } + + constexpr friend bool operator==(const device_tuple& lhs, + const device_tuple& rhs) + { + return lhs.value_ == rhs.value_; + } + + constexpr friend bool operator!=(const device_tuple& lhs, + const device_tuple& rhs) + { + return !(lhs == rhs); + } + +private: + T value_; +}; + + /** * A reference-like type pointing to a tuple of elements originating from a * tuple of iterators. A few caveats related to its use: @@ -45,45 +273,51 @@ class zip_iterator; */ template class zip_iterator_reference - : public std::tuple< + : public device_tuple< typename std::iterator_traits::reference...> { using ref_tuple_type = - std::tuple::reference...>; + device_tuple::reference...>; using value_type = - std::tuple::value_type...>; + device_tuple::value_type...>; using index_sequence = std::index_sequence_for; friend class zip_iterator; template - value_type cast_impl(std::index_sequence) const + constexpr value_type cast_impl(std::index_sequence) const { // gcc 5 throws error as using uninitialized array // std::tuple t = { 1, '2' }; is not allowed. 
// converting to 'std::tuple<...>' from initializer list would use // explicit constructor - return value_type(std::get(*this)...); + return value_type(get(*this)...); } template - void assign_impl(std::index_sequence, const value_type& other) + constexpr void assign_impl(std::index_sequence, + const value_type& other) { (void)std::initializer_list{ - (std::get(*this) = std::get(other), 0)...}; + (get(*this) = get(other), 0)...}; } - zip_iterator_reference(Iterators... it) : ref_tuple_type{*it...} {} + constexpr explicit zip_iterator_reference(Iterators... it) + : ref_tuple_type{*it...} + {} public: - operator value_type() const { return cast_impl(index_sequence{}); } + constexpr operator value_type() const + { + return cast_impl(index_sequence{}); + } - zip_iterator_reference& operator=(const value_type& other) + constexpr zip_iterator_reference& operator=(const value_type& other) { assign_impl(index_sequence{}, other); return *this; } - value_type copy() const { return *this; } + constexpr value_type copy() const { return *this; } }; @@ -123,153 +357,156 @@ class zip_iterator { public: using difference_type = std::ptrdiff_t; using value_type = - std::tuple::value_type...>; + device_tuple::value_type...>; using pointer = value_type*; using reference = zip_iterator_reference; using iterator_category = std::random_access_iterator_tag; using index_sequence = std::index_sequence_for; - explicit zip_iterator() = default; + constexpr zip_iterator() = default; - explicit zip_iterator(Iterators... its) : iterators_{its...} {} + constexpr explicit zip_iterator(Iterators... 
its) : iterators_{its...} {} - zip_iterator& operator+=(difference_type i) + constexpr zip_iterator& operator+=(difference_type i) { forall([i](auto& it) { it += i; }); return *this; } - zip_iterator& operator-=(difference_type i) + constexpr zip_iterator& operator-=(difference_type i) { forall([i](auto& it) { it -= i; }); return *this; } - zip_iterator& operator++() + constexpr zip_iterator& operator++() { forall([](auto& it) { it++; }); return *this; } - zip_iterator operator++(int) + constexpr zip_iterator operator++(int) { auto tmp = *this; ++(*this); return tmp; } - zip_iterator& operator--() + constexpr zip_iterator& operator--() { forall([](auto& it) { it--; }); return *this; } - zip_iterator operator--(int) + constexpr zip_iterator operator--(int) { auto tmp = *this; --(*this); return tmp; } - zip_iterator operator+(difference_type i) const + constexpr zip_iterator operator+(difference_type i) const { auto tmp = *this; tmp += i; return tmp; } - friend zip_iterator operator+(difference_type i, const zip_iterator& iter) + constexpr friend zip_iterator operator+(difference_type i, + const zip_iterator& iter) { return iter + i; } - zip_iterator operator-(difference_type i) const + constexpr zip_iterator operator-(difference_type i) const { auto tmp = *this; tmp -= i; return tmp; } - difference_type operator-(const zip_iterator& other) const + constexpr difference_type operator-(const zip_iterator& other) const { return forall_check_consistent( other, [](const auto& a, const auto& b) { return a - b; }); } - reference operator*() const + constexpr reference operator*() const { return deref_impl(std::index_sequence_for{}); } - reference operator[](difference_type i) const { return *(*this + i); } + constexpr reference operator[](difference_type i) const + { + return *(*this + i); + } - bool operator==(const zip_iterator& other) const + constexpr bool operator==(const zip_iterator& other) const { return forall_check_consistent( other, [](const auto& a, const auto& 
b) { return a == b; }); } - bool operator!=(const zip_iterator& other) const + constexpr bool operator!=(const zip_iterator& other) const { return !(*this == other); } - bool operator<(const zip_iterator& other) const + constexpr bool operator<(const zip_iterator& other) const { return forall_check_consistent( other, [](const auto& a, const auto& b) { return a < b; }); } - bool operator<=(const zip_iterator& other) const + constexpr bool operator<=(const zip_iterator& other) const { return forall_check_consistent( other, [](const auto& a, const auto& b) { return a <= b; }); } - bool operator>(const zip_iterator& other) const + constexpr bool operator>(const zip_iterator& other) const { return !(*this <= other); } - bool operator>=(const zip_iterator& other) const + constexpr bool operator>=(const zip_iterator& other) const { return !(*this < other); } private: template - reference deref_impl(std::index_sequence) const + constexpr reference deref_impl(std::index_sequence) const { - return reference{std::get(iterators_)...}; + return reference{get(iterators_)...}; } template - void forall(Functor fn) + constexpr void forall(Functor fn) { forall_impl(fn, index_sequence{}); } template - void forall_impl(Functor fn, std::index_sequence) + constexpr void forall_impl(Functor fn, std::index_sequence) { - (void)std::initializer_list{ - (fn(std::get(iterators_)), 0)...}; + (void)std::initializer_list{(fn(get(iterators_)), 0)...}; } template - void forall_impl(const zip_iterator& other, Functor fn, - std::index_sequence) const + constexpr void forall_impl(const zip_iterator& other, Functor fn, + std::index_sequence) const { (void)std::initializer_list{ - (fn(std::get(iterators_), std::get(other.iterators_)), - 0)...}; + (fn(get(iterators_), get(other.iterators_)), 0)...}; } template - auto forall_check_consistent(const zip_iterator& other, Functor fn) const + constexpr auto forall_check_consistent(const zip_iterator& other, + Functor fn) const { - auto it = 
std::get<0>(iterators_); - auto other_it = std::get<0>(other.iterators_); + auto it = get<0>(iterators_); + auto other_it = get<0>(other.iterators_); auto result = fn(it, other_it); forall_impl( other, [&](auto a, auto b) { assert(it - other_it == a - b); }, @@ -277,12 +514,13 @@ class zip_iterator { return result; } - std::tuple iterators_; + device_tuple iterators_; }; template -zip_iterator...> make_zip_iterator(Iterators&&... it) +constexpr zip_iterator...> make_zip_iterator( + Iterators&&... it) { return zip_iterator...>{ std::forward(it)...}; @@ -305,8 +543,8 @@ zip_iterator...> make_zip_iterator(Iterators&&... it) * @tparam Iterators the iterator types inside the corresponding zip_iterator */ template -void swap(zip_iterator_reference a, - zip_iterator_reference b) +constexpr void swap(zip_iterator_reference a, + zip_iterator_reference b) { auto tmp = a.copy(); a = b; @@ -318,8 +556,8 @@ void swap(zip_iterator_reference a, * @copydoc swap(zip_iterator_reference, zip_iterator_reference) */ template -void swap(typename zip_iterator::value_type& a, - zip_iterator_reference b) +constexpr void swap(typename zip_iterator::value_type& a, + zip_iterator_reference b) { auto tmp = a; a = b; @@ -331,8 +569,8 @@ void swap(typename zip_iterator::value_type& a, * @copydoc swap(zip_iterator_reference, zip_iterator_reference) */ template -void swap(zip_iterator_reference a, - typename zip_iterator::value_type& b) +constexpr void swap(zip_iterator_reference a, + typename zip_iterator::value_type& b) { auto tmp = a.copy(); a = b; @@ -468,6 +706,25 @@ permute_iterator make_permute_iterator( } // namespace detail + + +template +constexpr typename std::tuple_element>::type& +get(detail::device_tuple& tuple) +{ + return tuple.template get(); +} + + +template +constexpr const typename std::tuple_element>::type& +get(const detail::device_tuple& tuple) +{ + return tuple.template get(); +} + + } // namespace gko diff --git a/core/test/base/iterator_factory.cpp 
b/core/test/base/iterator_factory.cpp index 42ddff343c0..c4dc30bf219 100644 --- a/core/test/base/iterator_factory.cpp +++ b/core/test/base/iterator_factory.cpp @@ -156,6 +156,7 @@ TYPED_TEST(ZipIterator, IteratorReferenceOperatorSmaller2) TYPED_TEST(ZipIterator, IncreasingIterator) { + using gko::get; using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; std::vector vec1{this->reversed_index}; @@ -182,8 +183,8 @@ TYPED_TEST(ZipIterator, IncreasingIterator) ASSERT_TRUE(increment_pre_2 == increment_post_2); ASSERT_TRUE(begin == increment_post_test++); ASSERT_TRUE(begin + 1 == ++increment_pre_test); - ASSERT_TRUE(std::get<0>(*plus_2) == vec1[2]); - ASSERT_TRUE(std::get<1>(*plus_2) == vec2[2]); + ASSERT_TRUE(get<0>(*plus_2) == vec1[2]); + ASSERT_TRUE(get<1>(*plus_2) == vec2[2]); // check other comparison operators and difference std::vector> its{ begin, @@ -257,6 +258,7 @@ TYPED_TEST(ZipIterator, IncompatibleIteratorDeathTest) TYPED_TEST(ZipIterator, DecreasingIterator) { + using gko::get; using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; std::vector vec1{this->reversed_index}; @@ -280,13 +282,14 @@ TYPED_TEST(ZipIterator, DecreasingIterator) ASSERT_TRUE(decrement_pre_2 == decrement_post_2); ASSERT_TRUE(iter == decrement_post_test--); ASSERT_TRUE(iter - 1 == --decrement_pre_test); - ASSERT_TRUE(std::get<0>(*minus_2) == vec1[3]); - ASSERT_TRUE(std::get<1>(*minus_2) == vec2[3]); + ASSERT_TRUE(get<0>(*minus_2) == vec1[3]); + ASSERT_TRUE(get<1>(*minus_2) == vec2[3]); } TYPED_TEST(ZipIterator, CorrectDereferencing) { + using gko::get; using index_type_it = typename TestFixture::index_type; using value_type_it = typename TestFixture::value_type; std::vector vec1{this->reversed_index}; @@ -299,10 +302,10 @@ TYPED_TEST(ZipIterator, CorrectDereferencing) auto to_test_ref = *(begin + element_to_test); value_type to_test_pair = to_test_ref; // Testing implicit conversion - 
ASSERT_TRUE(std::get<0>(to_test_pair) == vec1[element_to_test]); - ASSERT_TRUE(std::get<0>(to_test_pair) == std::get<0>(to_test_ref)); - ASSERT_TRUE(std::get<1>(to_test_pair) == vec2[element_to_test]); - ASSERT_TRUE(std::get<1>(to_test_pair) == std::get<1>(to_test_ref)); + ASSERT_TRUE(get<0>(to_test_pair) == vec1[element_to_test]); + ASSERT_TRUE(get<0>(to_test_pair) == get<0>(to_test_ref)); + ASSERT_TRUE(get<1>(to_test_pair) == vec2[element_to_test]); + ASSERT_TRUE(get<1>(to_test_pair) == get<1>(to_test_ref)); } diff --git a/omp/distributed/partition_helpers_kernels.cpp b/omp/distributed/partition_helpers_kernels.cpp index ceae3e17679..a3dfa8fdef4 100644 --- a/omp/distributed/partition_helpers_kernels.cpp +++ b/omp/distributed/partition_helpers_kernels.cpp @@ -27,10 +27,9 @@ void sort_by_range_start( range_start_ends.get_data() + 1, [](const auto i) { return 2 * i; }); auto sort_it = detail::make_zip_iterator(start_it, end_it, part_ids_d); // TODO: use TBB or parallel std with c++17 - std::stable_sort(sort_it, sort_it + num_parts, - [](const auto& a, const auto& b) { - return std::get<0>(a) < std::get<0>(b); - }); + std::stable_sort( + sort_it, sort_it + num_parts, + [](const auto& a, const auto& b) { return get<0>(a) < get<0>(b); }); } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 09d1465896b..8e47caef520 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1155,9 +1155,8 @@ void sort_by_column_index(std::shared_ptr exec, auto row_nnz = row_ptrs[i + 1] - start_row_idx; auto it = detail::make_zip_iterator(col_idxs + start_row_idx, values + start_row_idx); - std::sort(it, it + row_nnz, [](auto t1, auto t2) { - return std::get<0>(t1) < std::get<0>(t2); - }); + std::sort(it, it + row_nnz, + [](auto t1, auto t2) { return get<0>(t1) < get<0>(t2); }); } } diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index db60d85db79..a6342034a56 100644 --- 
a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -398,9 +398,8 @@ void sort_by_column_index_impl( std::vector col_permute(nbnz_brow); std::iota(col_permute.begin(), col_permute.end(), 0); auto it = detail::make_zip_iterator(brow_col_idxs, col_permute.data()); - std::sort(it, it + nbnz_brow, [](auto a, auto b) { - return std::get<0>(a) < std::get<0>(b); - }); + std::sort(it, it + nbnz_brow, + [](auto a, auto b) { return get<0>(a) < get<0>(b); }); std::vector oldvalues(nbnz_brow * bs2); std::copy(brow_vals, brow_vals + nbnz_brow * bs2, oldvalues.begin()); diff --git a/omp/multigrid/pgm_kernels.cpp b/omp/multigrid/pgm_kernels.cpp index 9d2aa047cc4..4c824a0140b 100644 --- a/omp/multigrid/pgm_kernels.cpp +++ b/omp/multigrid/pgm_kernels.cpp @@ -43,8 +43,7 @@ void sort_row_major(std::shared_ptr exec, size_type nnz, { auto it = detail::make_zip_iterator(row_idxs, col_idxs, vals); std::stable_sort(it, it + nnz, [](auto a, auto b) { - return std::tie(std::get<0>(a), std::get<1>(a)) < - std::tie(std::get<0>(b), std::get<1>(b)); + return std::tie(get<0>(a), get<1>(a)) < std::tie(get<0>(b), get<1>(b)); }); } diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp index b57daab2eaa..0307974f278 100644 --- a/reference/distributed/partition_helpers_kernels.cpp +++ b/reference/distributed/partition_helpers_kernels.cpp @@ -26,10 +26,9 @@ void sort_by_range_start( auto end_it = detail::make_permute_iterator( range_start_ends.get_data() + 1, [](const auto i) { return 2 * i; }); auto sort_it = detail::make_zip_iterator(start_it, end_it, part_ids_d); - std::stable_sort(sort_it, sort_it + num_parts, - [](const auto& a, const auto& b) { - return std::get<0>(a) < std::get<0>(b); - }); + std::stable_sort( + sort_it, sort_it + num_parts, + [](const auto& a, const auto& b) { return get<0>(a) < get<0>(b); }); } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( @@ -51,9 +50,9 @@ void check_consecutive_ranges(std::shared_ptr 
exec, auto range_it = detail::make_zip_iterator(start_it, end_it); if (num_parts) { - result = std::all_of( - range_it, range_it + num_parts - 1, - [](const auto& r) { return std::get<0>(r) == std::get<1>(r); }); + result = + std::all_of(range_it, range_it + num_parts - 1, + [](const auto& r) { return get<0>(r) == get<1>(r); }); } else { result = true; } diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index f7e2fab4411..be97da442a1 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -1128,9 +1128,8 @@ void sort_by_column_index(std::shared_ptr exec, auto row_nnz = row_ptrs[i + 1] - start_row_idx; auto it = detail::make_zip_iterator(col_idxs + start_row_idx, values + start_row_idx); - std::sort(it, it + row_nnz, [](auto t1, auto t2) { - return std::get<0>(t1) < std::get<0>(t2); - }); + std::sort(it, it + row_nnz, + [](auto t1, auto t2) { return get<0>(t1) < get<0>(t2); }); } } diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp index 9e60e380d9c..cdedc36ddc0 100644 --- a/reference/matrix/fbcsr_kernels.cpp +++ b/reference/matrix/fbcsr_kernels.cpp @@ -418,9 +418,8 @@ void sort_by_column_index_impl( std::vector col_permute(nbnz_brow); std::iota(col_permute.begin(), col_permute.end(), 0); auto it = detail::make_zip_iterator(brow_col_idxs, col_permute.data()); - std::sort(it, it + nbnz_brow, [](auto a, auto b) { - return std::get<0>(a) < std::get<0>(b); - }); + std::sort(it, it + nbnz_brow, + [](auto a, auto b) { return get<0>(a) < get<0>(b); }); std::vector oldvalues(nbnz_brow * bs2); std::copy(brow_vals, brow_vals + nbnz_brow * bs2, oldvalues.begin()); diff --git a/reference/multigrid/pgm_kernels.cpp b/reference/multigrid/pgm_kernels.cpp index 2a6e3252a9f..bff2a776c6b 100644 --- a/reference/multigrid/pgm_kernels.cpp +++ b/reference/multigrid/pgm_kernels.cpp @@ -270,8 +270,7 @@ void sort_row_major(std::shared_ptr exec, size_type nnz, { auto it = 
detail::make_zip_iterator(row_idxs, col_idxs, vals); std::stable_sort(it, it + nnz, [](auto a, auto b) { - return std::tie(std::get<0>(a), std::get<1>(a)) < - std::tie(std::get<0>(b), std::get<1>(b)); + return std::tie(get<0>(a), get<1>(a)) < std::tie(get<0>(b), get<1>(b)); }); } diff --git a/test/base/CMakeLists.txt b/test/base/CMakeLists.txt index d54996f212a..5f31c25db19 100644 --- a/test/base/CMakeLists.txt +++ b/test/base/CMakeLists.txt @@ -1,6 +1,7 @@ ginkgo_create_common_test(batch_multi_vector_kernels) ginkgo_create_common_and_reference_test(device_matrix_data_kernels) ginkgo_create_common_device_test(index_range) +ginkgo_create_common_device_test(iterator_factory) ginkgo_create_common_device_test(kernel_launch_generic) ginkgo_create_common_and_reference_test(executor) ginkgo_create_common_and_reference_test(timer) diff --git a/test/base/iterator_factory.cpp b/test/base/iterator_factory.cpp new file mode 100644 index 00000000000..5dc97646960 --- /dev/null +++ b/test/base/iterator_factory.cpp @@ -0,0 +1,69 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/base/iterator_factory.hpp" + +#include + +#include + +#include + +#include "common/unified/base/kernel_launch.hpp" +#include "core/test/utils.hpp" +#include "test/utils/executor.hpp" + + +class IteratorFactory : public CommonTestFixture { +public: + IteratorFactory() + : key_array{exec, {6, 2, 3, 8, 1, 0, 2}}, + value_array{exec, {9, 5, 7, 2, 4, 7, 2}}, + expected_key_array{ref, {7, 1, 2, 2, 3, 6, 8}}, + expected_value_array{ref, {7, 4, 2, 5, 7, 9, 2}} + {} + + gko::array key_array; + gko::array value_array; + gko::array expected_key_array; + gko::array expected_value_array; +}; + + +// nvcc doesn't like device lambdas declared in complex classes, move it out +void run_zip_iterator(std::shared_ptr exec, + gko::array& key_array, gko::array& value_array) +{ + gko::kernels::EXEC_NAMESPACE::run_kernel( + exec, + [] GKO_KERNEL(auto i, 
auto keys, auto values, auto size) { + auto begin = gko::detail::make_zip_iterator(keys, values); + auto end = begin + size; + using std::swap; + for (auto it = begin; it != end; ++it) { + auto min_it = it; + for (auto it2 = it; it2 != end; ++it2) { + if (*it2 < *min_it) { + min_it = it2; + } + } + swap(*it, *min_it); + } + // check structured bindings + auto [key, value] = *begin; + static_assert(std::is_same::value, + "incorrect type"); + gko::get<0>(*begin) = value; + }, + 1, key_array, value_array, static_cast(key_array.get_size())); +} + + +TEST_F(IteratorFactory, KernelRunsZipIterator) +{ + run_zip_iterator(exec, key_array, value_array); + + GKO_ASSERT_ARRAY_EQ(key_array, expected_key_array); + GKO_ASSERT_ARRAY_EQ(value_array, expected_value_array); +} From b617d9e10d93fcfaf849d9b936ca03e3c11efeb2 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 10 Jul 2024 13:32:44 +0200 Subject: [PATCH 083/448] fix issues after rebase --- omp/distributed/index_map_kernels.cpp | 13 ++++++------- test/base/iterator_factory.cpp | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/omp/distributed/index_map_kernels.cpp b/omp/distributed/index_map_kernels.cpp index b01dab9cb33..7374f7b978b 100644 --- a/omp/distributed/index_map_kernels.cpp +++ b/omp/distributed/index_map_kernels.cpp @@ -58,16 +58,15 @@ void build_mapping( auto sort_it = detail::make_zip_iterator( full_remote_part_ids.begin(), recv_connections_ptr, range_ids.begin()); std::sort(sort_it, sort_it + input_size, [](const auto& a, const auto& b) { - return std::tie(std::get<0>(a), std::get<1>(a)) < - std::tie(std::get<0>(b), std::get<1>(b)); + return std::tie(get<0>(a), get<1>(a)) < std::tie(get<0>(b), get<1>(b)); }); // get only unique connections - auto unique_end = std::unique( - sort_it, sort_it + input_size, [](const auto& a, const auto& b) { - return std::tie(std::get<0>(a), std::get<1>(a)) == - std::tie(std::get<0>(b), std::get<1>(b)); - }); + auto unique_end = std::unique(sort_it, 
sort_it + input_size, + [](const auto& a, const auto& b) { + return std::tie(get<0>(a), get<1>(a)) == + std::tie(get<0>(b), get<1>(b)); + }); auto unique_size = std::distance(sort_it, unique_end); remote_global_idxs.resize_and_reset(unique_size); diff --git a/test/base/iterator_factory.cpp b/test/base/iterator_factory.cpp index 5dc97646960..be51a2df32c 100644 --- a/test/base/iterator_factory.cpp +++ b/test/base/iterator_factory.cpp @@ -12,7 +12,7 @@ #include "common/unified/base/kernel_launch.hpp" #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class IteratorFactory : public CommonTestFixture { @@ -35,7 +35,7 @@ class IteratorFactory : public CommonTestFixture { void run_zip_iterator(std::shared_ptr exec, gko::array& key_array, gko::array& value_array) { - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto keys, auto values, auto size) { auto begin = gko::detail::make_zip_iterator(keys, values); From ee8bb10aff292b56e9abe1203c50c0e8e96a81a1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 11 Jul 2024 13:31:16 +0200 Subject: [PATCH 084/448] fixes for MSVC and nvc++ - MSVC finds the get(...) member function, so we need to call the free function explicitly - the structured bindings refer to a reference --- core/base/iterator_factory.hpp | 8 +++++--- test/base/iterator_factory.cpp | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp index 938d705b04d..de5af49e24f 100644 --- a/core/base/iterator_factory.hpp +++ b/core/base/iterator_factory.hpp @@ -13,6 +13,8 @@ #include #include +#include + #include "core/base/copy_assignable.hpp" @@ -290,7 +292,7 @@ class zip_iterator_reference // std::tuple t = { 1, '2' }; is not allowed. 
// converting to 'std::tuple<...>' from initializer list would use // explicit constructor - return value_type(get(*this)...); + return value_type(gko::get(*this)...); } template @@ -298,7 +300,7 @@ class zip_iterator_reference const value_type& other) { (void)std::initializer_list{ - (get(*this) = get(other), 0)...}; + (gko::get(*this) = gko::get(other), 0)...}; } constexpr explicit zip_iterator_reference(Iterators... it) @@ -509,7 +511,7 @@ class zip_iterator { auto other_it = get<0>(other.iterators_); auto result = fn(it, other_it); forall_impl( - other, [&](auto a, auto b) { assert(it - other_it == a - b); }, + other, [&](auto a, auto b) { GKO_ASSERT(it - other_it == a - b); }, index_sequence{}); return result; } diff --git a/test/base/iterator_factory.cpp b/test/base/iterator_factory.cpp index be51a2df32c..5826118fd81 100644 --- a/test/base/iterator_factory.cpp +++ b/test/base/iterator_factory.cpp @@ -52,7 +52,8 @@ void run_zip_iterator(std::shared_ptr exec, } // check structured bindings auto [key, value] = *begin; - static_assert(std::is_same::value, + static_assert(std::is_same, + int>::value, "incorrect type"); gko::get<0>(*begin) = value; }, From e96bb188ed669b278e5eb9c9f9813114a197db50 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 12 Jul 2024 13:52:47 +0200 Subject: [PATCH 085/448] unify IC and ILU --- common/cuda_hip/CMakeLists.txt | 2 + .../cuda_hip/factorization/ic_kernels.cpp | 12 ++-- .../cuda_hip/factorization/ilu_kernels.cpp | 12 ++-- cuda/CMakeLists.txt | 2 - hip/CMakeLists.txt | 2 - hip/factorization/ic_kernels.hip.cpp | 63 ------------------ hip/factorization/ilu_kernels.hip.cpp | 64 ------------------- 7 files changed, 10 insertions(+), 147 deletions(-) rename cuda/factorization/ic_kernels.cu => common/cuda_hip/factorization/ic_kernels.cpp (91%) rename cuda/factorization/ilu_kernels.cu => common/cuda_hip/factorization/ilu_kernels.cpp (91%) delete mode 100644 hip/factorization/ic_kernels.hip.cpp delete mode 100644 
hip/factorization/ilu_kernels.hip.cpp diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index c18755ab164..4ae7c462b27 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -9,6 +9,8 @@ set(CUDA_HIP_SOURCES distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp factorization/factorization_kernels.cpp + factorization/ic_kernels.cpp + factorization/ilu_kernels.cpp factorization/lu_kernels.cpp factorization/par_ic_kernels.cpp factorization/par_ilu_kernels.cpp diff --git a/cuda/factorization/ic_kernels.cu b/common/cuda_hip/factorization/ic_kernels.cpp similarity index 91% rename from cuda/factorization/ic_kernels.cu rename to common/cuda_hip/factorization/ic_kernels.cpp index 3a4b4a55411..62963c479bd 100644 --- a/cuda/factorization/ic_kernels.cu +++ b/common/cuda_hip/factorization/ic_kernels.cpp @@ -6,17 +6,13 @@ #include +#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" namespace gko { namespace kernels { -namespace cuda { -/** - * @brief The ic factorization namespace. 
- * - * @ingroup factor - */ +namespace GKO_DEVICE_NAMESPACE { namespace ic_factorization { @@ -50,7 +46,7 @@ void compute(std::shared_ptr exec, SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); // CUDA 11.4 has a use-after-free bug on Turing -#if (CUDA_VERSION >= 11040) +#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040) exec->synchronize(); #endif @@ -62,6 +58,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL); } // namespace ic_factorization -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/cuda/factorization/ilu_kernels.cu b/common/cuda_hip/factorization/ilu_kernels.cpp similarity index 91% rename from cuda/factorization/ilu_kernels.cu rename to common/cuda_hip/factorization/ilu_kernels.cpp index 6096e89ef4b..b3f959bba02 100644 --- a/cuda/factorization/ilu_kernels.cu +++ b/common/cuda_hip/factorization/ilu_kernels.cpp @@ -6,17 +6,13 @@ #include +#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" namespace gko { namespace kernels { -namespace cuda { -/** - * @brief The ilu factorization namespace. 
- * - * @ingroup factor - */ +namespace GKO_DEVICE_NAMESPACE { namespace ilu_factorization { @@ -50,7 +46,7 @@ void compute_lu(std::shared_ptr exec, SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); // CUDA 11.4 has a use-after-free bug on Turing -#if (CUDA_VERSION >= 11040) +#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040) exec->synchronize(); #endif @@ -63,6 +59,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace ilu_factorization -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 30b3f2747e6..89c711965e1 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -18,8 +18,6 @@ target_sources(ginkgo_cuda base/stream.cpp base/timer.cpp base/version.cpp - factorization/ic_kernels.cu - factorization/ilu_kernels.cu factorization/par_ict_kernels.cu factorization/par_ilut_approx_filter_kernels.cu factorization/par_ilut_filter_kernels.cu diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 23584c2742a..32e3767f93c 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -16,8 +16,6 @@ set(GINKGO_HIP_SOURCES base/stream.hip.cpp base/timer.hip.cpp base/version.hip.cpp - factorization/ic_kernels.hip.cpp - factorization/ilu_kernels.hip.cpp factorization/par_ict_kernels.hip.cpp factorization/par_ilut_approx_filter_kernels.hip.cpp factorization/par_ilut_filter_kernels.hip.cpp diff --git a/hip/factorization/ic_kernels.hip.cpp b/hip/factorization/ic_kernels.hip.cpp deleted file mode 100644 index cfbb12bd5b3..00000000000 --- a/hip/factorization/ic_kernels.hip.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/ic_kernels.hpp" - -#include - -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief
The ic factorization namespace. - * - * @ingroup factor - */ -namespace ic_factorization { - - -template -void compute(std::shared_ptr exec, - matrix::Csr* m) -{ - const auto id = exec->get_device_id(); - auto handle = exec->get_sparselib_handle(); - auto desc = sparselib::create_mat_descr(); - auto info = sparselib::create_ic0_info(); - - // get buffer size for IC - IndexType num_rows = m->get_size()[0]; - IndexType nnz = m->get_num_stored_elements(); - size_type buffer_size{}; - sparselib::ic0_buffer_size(handle, num_rows, nnz, desc, - m->get_const_values(), m->get_const_row_ptrs(), - m->get_const_col_idxs(), info, buffer_size); - - array buffer{exec, buffer_size}; - - // set up IC(0) - sparselib::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), - m->get_const_row_ptrs(), m->get_const_col_idxs(), - info, SPARSELIB_SOLVE_POLICY_USE_LEVEL, - buffer.get_data()); - - sparselib::ic0(handle, num_rows, nnz, desc, m->get_values(), - m->get_const_row_ptrs(), m->get_const_col_idxs(), info, - SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); - - sparselib::destroy_ic0_info(info); - sparselib::destroy(desc); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL); - - -} // namespace ic_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/ilu_kernels.hip.cpp b/hip/factorization/ilu_kernels.hip.cpp deleted file mode 100644 index 45d468d0500..00000000000 --- a/hip/factorization/ilu_kernels.hip.cpp +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/ilu_kernels.hpp" - -#include - -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The ilu factorization namespace. 
- * - * @ingroup factor - */ -namespace ilu_factorization { - - -template -void compute_lu(std::shared_ptr exec, - matrix::Csr* m) -{ - const auto id = exec->get_device_id(); - auto handle = exec->get_sparselib_handle(); - auto desc = sparselib::create_mat_descr(); - auto info = sparselib::create_ilu0_info(); - - // get buffer size for ILU - IndexType num_rows = m->get_size()[0]; - IndexType nnz = m->get_num_stored_elements(); - size_type buffer_size{}; - sparselib::ilu0_buffer_size(handle, num_rows, nnz, desc, - m->get_const_values(), m->get_const_row_ptrs(), - m->get_const_col_idxs(), info, buffer_size); - - array buffer{exec, buffer_size}; - - // set up ILU(0) - sparselib::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), - m->get_const_row_ptrs(), m->get_const_col_idxs(), - info, SPARSELIB_SOLVE_POLICY_USE_LEVEL, - buffer.get_data()); - - sparselib::ilu0(handle, num_rows, nnz, desc, m->get_values(), - m->get_const_row_ptrs(), m->get_const_col_idxs(), info, - SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); - - sparselib::destroy_ilu0_info(info); - sparselib::destroy(desc); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ILU_COMPUTE_LU_KERNEL); - - -} // namespace ilu_factorization -} // namespace hip -} // namespace kernels -} // namespace gko From e268e29fbcd49062727efbaa79f88aa4f89d0af2 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 12 Jul 2024 14:16:31 +0200 Subject: [PATCH 086/448] unify most of ParILUT/ICT --- common/cuda_hip/CMakeLists.txt | 3 + ...ct_kernels.hpp.inc => par_ict_kernels.cpp} | 182 +++++++++++++++++ ...ls.hpp.inc => par_ilut_spgeam_kernels.cpp} | 151 ++++++++++++++ .../factorization/par_ilut_sweep_kernels.cpp | 97 ++++++++- .../par_ilut_sweep_kernels.hpp.inc | 94 --------- cuda/CMakeLists.txt | 3 - cuda/factorization/par_ict_kernels.cu | 187 ------------------ cuda/factorization/par_ilut_spgeam_kernels.cu | 156 --------------- hip/CMakeLists.txt | 3 - 
hip/factorization/par_ict_kernels.hip.cpp | 187 ------------------ .../par_ilut_spgeam_kernels.hip.cpp | 156 --------------- .../par_ilut_sweep_kernels.hip.cpp | 120 ----------- 12 files changed, 429 insertions(+), 910 deletions(-) rename common/cuda_hip/factorization/{par_ict_kernels.hpp.inc => par_ict_kernels.cpp} (62%) rename common/cuda_hip/factorization/{par_ilut_spgeam_kernels.hpp.inc => par_ilut_spgeam_kernels.cpp} (63%) rename cuda/factorization/par_ilut_sweep_kernels.cu => common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp (54%) delete mode 100644 common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc delete mode 100644 cuda/factorization/par_ict_kernels.cu delete mode 100644 cuda/factorization/par_ilut_spgeam_kernels.cu delete mode 100644 hip/factorization/par_ict_kernels.hip.cpp delete mode 100644 hip/factorization/par_ilut_spgeam_kernels.hip.cpp delete mode 100644 hip/factorization/par_ilut_sweep_kernels.hip.cpp diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 4ae7c462b27..c4a56482b1d 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -13,7 +13,10 @@ set(CUDA_HIP_SOURCES factorization/ilu_kernels.cpp factorization/lu_kernels.cpp factorization/par_ic_kernels.cpp + factorization/par_ict_kernels.cpp factorization/par_ilu_kernels.cpp + factorization/par_ilut_spgeam_kernels.cpp + factorization/par_ilut_sweep_kernels.cpp matrix/coo_kernels.cpp matrix/dense_kernels.cpp matrix/diagonal_kernels.cpp diff --git a/common/cuda_hip/factorization/par_ict_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_kernels.cpp similarity index 62% rename from common/cuda_hip/factorization/par_ict_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ict_kernels.cpp index 87aa8297345..94aa5e5124e 100644 --- a/common/cuda_hip/factorization/par_ict_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ict_kernels.cpp @@ -2,6 +2,49 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include 
"core/factorization/par_ict_kernels.hpp" + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The parallel ICT factorization namespace. + * + * @ingroup factor + */ +namespace par_ict_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for all warp-parallel kernels (filter, add_candidates) +using compiled_kernels = + syn::value_list; + + namespace kernel { @@ -275,3 +318,142 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep( } // namespace kernel + + +namespace { + + +template +void add_candidates(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* llh, + const matrix::Csr* a, + const matrix::Csr* l, + matrix::Csr* l_new) +{ + auto num_rows = static_cast(llh->get_size()[0]); + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, subwarps_per_block); + matrix::CsrBuilder l_new_builder(l_new); + auto llh_row_ptrs = llh->get_const_row_ptrs(); + auto llh_col_idxs = llh->get_const_col_idxs(); + auto llh_vals = llh->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto l_row_ptrs = l->get_const_row_ptrs(); 
+ auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + // count non-zeros per row + if (num_blocks > 0) { + kernel::ict_tri_spgeam_nnz + <<get_stream()>>>( + llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs, + l_new_row_ptrs, num_rows); + } + + // build row ptrs + components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); + + // resize output arrays + auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); + l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); + l_new_builder.get_value_array().resize_and_reset(l_new_nnz); + + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + + // fill columns and values + if (num_blocks > 0) { + kernel::ict_tri_spgeam_init + <<get_stream()>>>( + llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals), + a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs, + l_col_idxs, as_device_type(l_vals), l_new_row_ptrs, + l_new_col_idxs, as_device_type(l_new_vals), num_rows); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); + + +template +void compute_factor(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* a, + matrix::Csr* l, + const matrix::Coo* l_coo) +{ + auto total_nnz = static_cast(l->get_num_stored_elements()); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(total_nnz, block_size); + if (num_blocks > 0) { + kernel::ict_sweep + <<get_stream()>>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_device_type(a->get_const_values()), l->get_const_row_ptrs(), + l_coo->get_const_row_idxs(), l->get_const_col_idxs(), + as_device_type(l->get_values()), + static_cast(l->get_num_stored_elements())); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor); + + +} // namespace + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr* llh, + const 
matrix::Csr* a, + const matrix::Csr* l, + matrix::Csr* l_new) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + llh->get_num_stored_elements() + a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_add_candidates( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); + + +template +void compute_factor(std::shared_ptr exec, + const matrix::Csr* a, + matrix::Csr* l, + const matrix::Coo* l_coo) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = 2 * l->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_compute_factor( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); + + +} // namespace par_ict_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp similarity index 63% rename from common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp index a97f0f08937..6cc77660394 100644 --- a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp @@ -2,6 +2,47 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/math.hpp" +#include 
"common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/par_ilut_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The parallel ILUT factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for add_candidates kernels +using compiled_kernels = + syn::value_list; + + namespace kernel { @@ -246,3 +287,113 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init( } // namespace kernel + + +namespace { + + +template +void add_candidates(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* lu, + const matrix::Csr* a, + const matrix::Csr* l, + const matrix::Csr* u, + matrix::Csr* l_new, + matrix::Csr* u_new) +{ + auto num_rows = static_cast(lu->get_size()[0]); + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, subwarps_per_block); + matrix::CsrBuilder l_new_builder(l_new); + matrix::CsrBuilder u_new_builder(u_new); + auto lu_row_ptrs = lu->get_const_row_ptrs(); + auto lu_col_idxs = lu->get_const_col_idxs(); + auto lu_vals = lu->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = 
l->get_const_values(); + auto u_row_ptrs = u->get_const_row_ptrs(); + auto u_col_idxs = u->get_const_col_idxs(); + auto u_vals = u->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + auto u_new_row_ptrs = u_new->get_row_ptrs(); + if (num_blocks > 0) { + // count non-zeros per row + kernel::tri_spgeam_nnz + <<get_stream()>>>( + lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, + l_new_row_ptrs, u_new_row_ptrs, num_rows); + } + + // build row ptrs + components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); + components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1); + + // resize output arrays + auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); + auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows); + l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); + l_new_builder.get_value_array().resize_and_reset(l_new_nnz); + u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz); + u_new_builder.get_value_array().resize_and_reset(u_new_nnz); + + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + auto u_new_col_idxs = u_new->get_col_idxs(); + auto u_new_vals = u_new->get_values(); + + if (num_blocks > 0) { + // fill columns and values + kernel::tri_spgeam_init + <<get_stream()>>>( + lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs, + a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs, + as_device_type(l_vals), u_row_ptrs, u_col_idxs, + as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs, + as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, + as_device_type(u_new_vals), num_rows); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); + + +} // namespace + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr* lu, + const matrix::Csr* a, + const matrix::Csr* l, + const matrix::Csr* u, + matrix::Csr* l_new, + matrix::Csr* u_new) +{ + auto num_rows = 
a->get_size()[0]; + auto total_nnz = + lu->get_num_stored_elements() + a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_add_candidates( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, + u_new); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/cuda/factorization/par_ilut_sweep_kernels.cu b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp similarity index 54% rename from cuda/factorization/par_ilut_sweep_kernels.cu rename to common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp index 9e277549aa4..52f62b50e6a 100644 --- a/cuda/factorization/par_ilut_sweep_kernels.cu +++ b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp @@ -27,7 +27,7 @@ namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The parallel ILUT factorization namespace. 
* @@ -44,7 +44,96 @@ using compiled_kernels = syn::value_list; -#include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc" +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void sweep( + const IndexType* __restrict__ a_row_ptrs, + const IndexType* __restrict__ a_col_idxs, + const ValueType* __restrict__ a_vals, + const IndexType* __restrict__ l_row_ptrs, + const IndexType* __restrict__ l_row_idxs, + const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals, + IndexType l_nnz, const IndexType* __restrict__ u_row_idxs, + const IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_vals, + const IndexType* __restrict__ ut_col_ptrs, + const IndexType* __restrict__ ut_row_idxs, ValueType* __restrict__ ut_vals, + IndexType u_nnz) +{ + auto tidx = thread::get_subwarp_id_flat(); + if (tidx >= l_nnz + u_nnz) { + return; + } + // split the subwarps into two halves for lower and upper triangle + auto l_nz = tidx; + auto u_nz = l_nz - l_nnz; + auto lower = u_nz < 0; + auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz]; + auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz]; + if (lower && row == col) { + // don't update the diagonal twice + return; + } + auto subwarp = + group::tiled_partition(group::this_thread_block()); + // find entry of A at (row, col) + auto a_row_begin = a_row_ptrs[row]; + auto a_row_end = a_row_ptrs[row + 1]; + auto a_row_size = a_row_end - a_row_begin; + auto a_idx = + group_wide_search(a_row_begin, a_row_size, subwarp, + [&](IndexType i) { return a_col_idxs[i] >= col; }); + bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col; + auto a_val = has_a ? 
a_vals[a_idx] : zero(); + auto l_row_begin = l_row_ptrs[row]; + auto l_row_size = l_row_ptrs[row + 1] - l_row_begin; + auto ut_col_begin = ut_col_ptrs[col]; + auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin; + ValueType sum{}; + IndexType ut_nz{}; + auto last_entry = min(row, col); + group_merge( + l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin, + ut_col_size, subwarp, + [&](IndexType l_idx, IndexType l_col, IndexType ut_idx, + IndexType ut_row, IndexType, bool) { + // we don't need to use the `bool valid` because last_entry is + // already a smaller sentinel value than the one used in group_merge + if (l_col == ut_row && l_col < last_entry) { + sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * + load_relaxed(ut_vals + (ut_idx + ut_col_begin)); + } + // remember the transposed element + auto found_transp = subwarp.ballot(ut_row == row); + if (found_transp) { + ut_nz = + subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1); + } + return true; + }); + // accumulate result from all threads + sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; }); + + if (subwarp.thread_rank() == 0) { + if (lower) { + auto to_write = (a_val - sum) / + load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1)); + if (is_finite(to_write)) { + store_relaxed(l_vals + l_nz, to_write); + } + } else { + auto to_write = a_val - sum; + if (is_finite(to_write)) { + store_relaxed(u_vals + u_nz, to_write); + store_relaxed(ut_vals + ut_nz, to_write); + } + } + } +} + + +} // namespace kernel namespace { @@ -115,6 +204,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace par_ilut_factorization -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels -} // namespace gko +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc deleted file mode 100644 index 9da94a878b3..00000000000 --- 
a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc +++ /dev/null @@ -1,94 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void sweep( - const IndexType* __restrict__ a_row_ptrs, - const IndexType* __restrict__ a_col_idxs, - const ValueType* __restrict__ a_vals, - const IndexType* __restrict__ l_row_ptrs, - const IndexType* __restrict__ l_row_idxs, - const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals, - IndexType l_nnz, const IndexType* __restrict__ u_row_idxs, - const IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_vals, - const IndexType* __restrict__ ut_col_ptrs, - const IndexType* __restrict__ ut_row_idxs, ValueType* __restrict__ ut_vals, - IndexType u_nnz) -{ - auto tidx = thread::get_subwarp_id_flat(); - if (tidx >= l_nnz + u_nnz) { - return; - } - // split the subwarps into two halves for lower and upper triangle - auto l_nz = tidx; - auto u_nz = l_nz - l_nnz; - auto lower = u_nz < 0; - auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz]; - auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz]; - if (lower && row == col) { - // don't update the diagonal twice - return; - } - auto subwarp = - group::tiled_partition(group::this_thread_block()); - // find entry of A at (row, col) - auto a_row_begin = a_row_ptrs[row]; - auto a_row_end = a_row_ptrs[row + 1]; - auto a_row_size = a_row_end - a_row_begin; - auto a_idx = - group_wide_search(a_row_begin, a_row_size, subwarp, - [&](IndexType i) { return a_col_idxs[i] >= col; }); - bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col; - auto a_val = has_a ? 
a_vals[a_idx] : zero(); - auto l_row_begin = l_row_ptrs[row]; - auto l_row_size = l_row_ptrs[row + 1] - l_row_begin; - auto ut_col_begin = ut_col_ptrs[col]; - auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin; - ValueType sum{}; - IndexType ut_nz{}; - auto last_entry = min(row, col); - group_merge( - l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin, - ut_col_size, subwarp, - [&](IndexType l_idx, IndexType l_col, IndexType ut_idx, - IndexType ut_row, IndexType, bool) { - // we don't need to use the `bool valid` because last_entry is - // already a smaller sentinel value than the one used in group_merge - if (l_col == ut_row && l_col < last_entry) { - sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * - load_relaxed(ut_vals + (ut_idx + ut_col_begin)); - } - // remember the transposed element - auto found_transp = subwarp.ballot(ut_row == row); - if (found_transp) { - ut_nz = - subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1); - } - return true; - }); - // accumulate result from all threads - sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; }); - - if (subwarp.thread_rank() == 0) { - if (lower) { - auto to_write = (a_val - sum) / - load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1)); - if (is_finite(to_write)) { - store_relaxed(l_vals + l_nz, to_write); - } - } else { - auto to_write = a_val - sum; - if (is_finite(to_write)) { - store_relaxed(u_vals + u_nz, to_write); - store_relaxed(ut_vals + ut_nz, to_write); - } - } - } -} - - -} // namespace kernel diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 89c711965e1..ae506faed4b 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -18,13 +18,10 @@ target_sources(ginkgo_cuda base/stream.cpp base/timer.cpp base/version.cpp - factorization/par_ict_kernels.cu factorization/par_ilut_approx_filter_kernels.cu factorization/par_ilut_filter_kernels.cu factorization/par_ilut_select_common.cu factorization/par_ilut_select_kernels.cu - 
factorization/par_ilut_spgeam_kernels.cu - factorization/par_ilut_sweep_kernels.cu matrix/batch_csr_kernels.cu matrix/batch_dense_kernels.cu matrix/batch_ell_kernels.cu diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu deleted file mode 100644 index 62964925aa4..00000000000 --- a/cuda/factorization/par_ict_kernels.cu +++ /dev/null @@ -1,187 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ict_kernels.hpp" - -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/memory.hpp" -#include "common/cuda_hip/components/merging.hpp" -#include "common/cuda_hip/components/prefix_sum.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/searching.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ICT factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ict_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for all warp-parallel kernels (filter, add_candidates) -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc" - - -namespace { - - -template -void add_candidates(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* llh, - const matrix::Csr* a, - const matrix::Csr* l, - matrix::Csr* l_new) -{ - auto num_rows = static_cast(llh->get_size()[0]); - auto subwarps_per_block = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, subwarps_per_block); - matrix::CsrBuilder l_new_builder(l_new); - auto llh_row_ptrs = llh->get_const_row_ptrs(); - auto llh_col_idxs = llh->get_const_col_idxs(); - auto llh_vals = llh->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto a_vals = a->get_const_values(); - auto l_row_ptrs = l->get_const_row_ptrs(); - auto l_col_idxs = l->get_const_col_idxs(); - auto l_vals = l->get_const_values(); - auto l_new_row_ptrs = l_new->get_row_ptrs(); - // count non-zeros per row - if (num_blocks > 0) { - kernel::ict_tri_spgeam_nnz - <<get_stream()>>>( - llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs, - l_new_row_ptrs, num_rows); - } - - // build row ptrs - components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); - - // resize output arrays - auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); - l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); - l_new_builder.get_value_array().resize_and_reset(l_new_nnz); - - auto l_new_col_idxs = l_new->get_col_idxs(); - auto l_new_vals = l_new->get_values(); - - // fill columns and values - if (num_blocks > 0) { - kernel::ict_tri_spgeam_init - <<get_stream()>>>( - llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals), - a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs, - l_col_idxs, 
as_device_type(l_vals), l_new_row_ptrs, - l_new_col_idxs, as_device_type(l_new_vals), num_rows); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); - - -template -void compute_factor(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo) -{ - auto total_nnz = static_cast(l->get_num_stored_elements()); - auto block_size = default_block_size / subwarp_size; - auto num_blocks = ceildiv(total_nnz, block_size); - if (num_blocks > 0) { - kernel::ict_sweep - <<get_stream()>>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_device_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - static_cast(l->get_num_stored_elements())); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor); - - -} // namespace - - -template -void add_candidates(std::shared_ptr exec, - const matrix::Csr* llh, - const matrix::Csr* a, - const matrix::Csr* l, - matrix::Csr* l_new) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = - llh->get_num_stored_elements() + a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_add_candidates( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); - - -template -void compute_factor(std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = 2 * l->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_compute_factor( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= 
compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); - - -} // namespace par_ict_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/par_ilut_spgeam_kernels.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu deleted file mode 100644 index 7277093314a..00000000000 --- a/cuda/factorization/par_ilut_spgeam_kernels.cu +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/merging.hpp" -#include "common/cuda_hip/components/prefix_sum.hpp" -#include "common/cuda_hip/components/searching.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/factorization/par_ilut_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ILUT factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for add_candidates kernels -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc" - - -namespace { - - -template -void add_candidates(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* lu, - const matrix::Csr* a, - const matrix::Csr* l, - const matrix::Csr* u, - matrix::Csr* l_new, - matrix::Csr* u_new) -{ - auto num_rows = static_cast(lu->get_size()[0]); - auto subwarps_per_block = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, subwarps_per_block); - matrix::CsrBuilder l_new_builder(l_new); - matrix::CsrBuilder u_new_builder(u_new); - auto lu_row_ptrs = lu->get_const_row_ptrs(); - auto lu_col_idxs = lu->get_const_col_idxs(); - auto lu_vals = lu->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto a_vals = a->get_const_values(); - auto l_row_ptrs = l->get_const_row_ptrs(); - auto l_col_idxs = l->get_const_col_idxs(); - auto l_vals = l->get_const_values(); - auto u_row_ptrs = u->get_const_row_ptrs(); - auto u_col_idxs = u->get_const_col_idxs(); - auto u_vals = u->get_const_values(); - auto l_new_row_ptrs = l_new->get_row_ptrs(); - auto u_new_row_ptrs = u_new->get_row_ptrs(); - if (num_blocks > 0) { - // count non-zeros per row - kernel::tri_spgeam_nnz - <<get_stream()>>>( - lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, - l_new_row_ptrs, u_new_row_ptrs, num_rows); - } - - // build row ptrs - components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); - components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1); - - // resize output arrays - auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); - auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows); - l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); - 
l_new_builder.get_value_array().resize_and_reset(l_new_nnz); - u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz); - u_new_builder.get_value_array().resize_and_reset(u_new_nnz); - - auto l_new_col_idxs = l_new->get_col_idxs(); - auto l_new_vals = l_new->get_values(); - auto u_new_col_idxs = u_new->get_col_idxs(); - auto u_new_vals = u_new->get_values(); - - if (num_blocks > 0) { - // fill columns and values - kernel::tri_spgeam_init - <<get_stream()>>>( - lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs, - a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs, - as_device_type(l_vals), u_row_ptrs, u_col_idxs, - as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs, - as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, - as_device_type(u_new_vals), num_rows); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); - - -} // namespace - - -template -void add_candidates(std::shared_ptr exec, - const matrix::Csr* lu, - const matrix::Csr* a, - const matrix::Csr* l, - const matrix::Csr* u, - matrix::Csr* l_new, - matrix::Csr* u_new) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = - lu->get_num_stored_elements() + a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_add_candidates( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, - u_new); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 32e3767f93c..0bfe56d7db1 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -16,13 +16,10 @@ set(GINKGO_HIP_SOURCES base/stream.hip.cpp base/timer.hip.cpp 
base/version.hip.cpp - factorization/par_ict_kernels.hip.cpp factorization/par_ilut_approx_filter_kernels.hip.cpp factorization/par_ilut_filter_kernels.hip.cpp factorization/par_ilut_select_common.hip.cpp factorization/par_ilut_select_kernels.hip.cpp - factorization/par_ilut_spgeam_kernels.hip.cpp - factorization/par_ilut_sweep_kernels.hip.cpp matrix/batch_csr_kernels.hip.cpp matrix/batch_dense_kernels.hip.cpp matrix/batch_ell_kernels.hip.cpp diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp deleted file mode 100644 index ed7b104471b..00000000000 --- a/hip/factorization/par_ict_kernels.hip.cpp +++ /dev/null @@ -1,187 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ict_kernels.hpp" - -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/memory.hpp" -#include "common/cuda_hip/components/merging.hpp" -#include "common/cuda_hip/components/prefix_sum.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/searching.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The parallel ICT factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ict_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for all warp-parallel kernels (filter, add_candidates) -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc" - - -namespace { - - -template -void add_candidates(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* llh, - const matrix::Csr* a, - const matrix::Csr* l, - matrix::Csr* l_new) -{ - auto num_rows = static_cast(llh->get_size()[0]); - auto subwarps_per_block = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, subwarps_per_block); - matrix::CsrBuilder l_new_builder(l_new); - auto llh_row_ptrs = llh->get_const_row_ptrs(); - auto llh_col_idxs = llh->get_const_col_idxs(); - auto llh_vals = llh->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto a_vals = a->get_const_values(); - auto l_row_ptrs = l->get_const_row_ptrs(); - auto l_col_idxs = l->get_const_col_idxs(); - auto l_vals = l->get_const_values(); - auto l_new_row_ptrs = l_new->get_row_ptrs(); - // count non-zeros per row - if (num_blocks > 0) { - kernel::ict_tri_spgeam_nnz - <<get_stream()>>>( - llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs, - l_new_row_ptrs, num_rows); - } - - // build row ptrs - components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); - - // resize output arrays - auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); - l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); - l_new_builder.get_value_array().resize_and_reset(l_new_nnz); - - auto l_new_col_idxs = l_new->get_col_idxs(); - auto l_new_vals = l_new->get_values(); - - // fill columns and values - if (num_blocks > 0) { - kernel::ict_tri_spgeam_init - <<get_stream()>>>( - llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals), - a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs, - l_col_idxs, 
as_device_type(l_vals), l_new_row_ptrs, - l_new_col_idxs, as_device_type(l_new_vals), num_rows); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); - - -template -void compute_factor(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo) -{ - auto total_nnz = static_cast(l->get_num_stored_elements()); - auto block_size = default_block_size / subwarp_size; - auto num_blocks = ceildiv(total_nnz, block_size); - if (num_blocks > 0) { - kernel::ict_sweep - <<get_stream()>>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_device_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - static_cast(l->get_num_stored_elements())); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor); - - -} // namespace - - -template -void add_candidates(std::shared_ptr exec, - const matrix::Csr* llh, - const matrix::Csr* a, - const matrix::Csr* l, - matrix::Csr* l_new) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = - llh->get_num_stored_elements() + a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_add_candidates( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); - - -template -void compute_factor(std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = 2 * l->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_compute_factor( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= 
compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); - - -} // namespace par_ict_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp deleted file mode 100644 index 5757e00d2a3..00000000000 --- a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/merging.hpp" -#include "common/cuda_hip/components/prefix_sum.hpp" -#include "common/cuda_hip/components/searching.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/factorization/par_ilut_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The parallel ILUT factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for add_candidates kernels -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc" - - -namespace { - - -template -void add_candidates(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* lu, - const matrix::Csr* a, - const matrix::Csr* l, - const matrix::Csr* u, - matrix::Csr* l_new, - matrix::Csr* u_new) -{ - auto num_rows = static_cast(lu->get_size()[0]); - auto subwarps_per_block = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, subwarps_per_block); - matrix::CsrBuilder l_new_builder(l_new); - matrix::CsrBuilder u_new_builder(u_new); - auto lu_row_ptrs = lu->get_const_row_ptrs(); - auto lu_col_idxs = lu->get_const_col_idxs(); - auto lu_vals = lu->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto a_vals = a->get_const_values(); - auto l_row_ptrs = l->get_const_row_ptrs(); - auto l_col_idxs = l->get_const_col_idxs(); - auto l_vals = l->get_const_values(); - auto u_row_ptrs = u->get_const_row_ptrs(); - auto u_col_idxs = u->get_const_col_idxs(); - auto u_vals = u->get_const_values(); - auto l_new_row_ptrs = l_new->get_row_ptrs(); - auto u_new_row_ptrs = u_new->get_row_ptrs(); - if (num_blocks > 0) { - // count non-zeros per row - kernel::tri_spgeam_nnz - <<get_stream()>>>( - lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, - l_new_row_ptrs, u_new_row_ptrs, num_rows); - } - - // build row ptrs - components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); - components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1); - - // resize output arrays - auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); - auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows); - l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); - 
l_new_builder.get_value_array().resize_and_reset(l_new_nnz); - u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz); - u_new_builder.get_value_array().resize_and_reset(u_new_nnz); - - auto l_new_col_idxs = l_new->get_col_idxs(); - auto l_new_vals = l_new->get_values(); - auto u_new_col_idxs = u_new->get_col_idxs(); - auto u_new_vals = u_new->get_values(); - - if (num_blocks > 0) { - // fill columns and values - kernel::tri_spgeam_init - <<get_stream()>>>( - lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs, - a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs, - as_device_type(l_vals), u_row_ptrs, u_col_idxs, - as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs, - as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, - as_device_type(u_new_vals), num_rows); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); - - -} // namespace - - -template -void add_candidates(std::shared_ptr exec, - const matrix::Csr* lu, - const matrix::Csr* a, - const matrix::Csr* l, - const matrix::Csr* u, - matrix::Csr* l_new, - matrix::Csr* u_new) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = - lu->get_num_stored_elements() + a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_add_candidates( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, - u_new); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/par_ilut_sweep_kernels.hip.cpp b/hip/factorization/par_ilut_sweep_kernels.hip.cpp deleted file mode 100644 index de271d6eebd..00000000000 --- a/hip/factorization/par_ilut_sweep_kernels.hip.cpp +++ /dev/null @@ 
-1,120 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/memory.hpp" -#include "common/cuda_hip/components/merging.hpp" -#include "common/cuda_hip/components/prefix_sum.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/searching.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/factorization/par_ilut_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The parallel ILUT factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for all warp-parallel kernels (filter, add_candidates) -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc" - - -namespace { - - -template -void compute_l_u_factors(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo, - matrix::Csr* u, - const matrix::Coo* u_coo, - matrix::Csr* u_csc) -{ - auto total_nnz = static_cast(l->get_num_stored_elements() + - u->get_num_stored_elements()); - auto block_size = default_block_size / subwarp_size; - auto num_blocks = ceildiv(total_nnz, block_size); - if (num_blocks > 0) { - kernel::sweep - <<get_stream()>>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_device_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - static_cast(l->get_num_stored_elements()), - u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), - as_device_type(u->get_values()), u_csc->get_const_row_ptrs(), - u_csc->get_const_col_idxs(), - as_device_type(u_csc->get_values()), - static_cast(u->get_num_stored_elements())); - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors, - compute_l_u_factors); - - -} // namespace - - -template -void compute_l_u_factors(std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo, - matrix::Csr* u, - const matrix::Coo* u_coo, - matrix::Csr* u_csc) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = - l->get_num_stored_elements() + u->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_compute_l_u_factors( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - 
syn::value_list(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo, - u_csc); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace hip -} // namespace kernels -} // namespace gko From fd62a02bc753462c662e54a860bc056f5a3900c4 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 12 Jul 2024 17:30:46 +0200 Subject: [PATCH 087/448] unify ParILUT/ICT --- common/cuda_hip/CMakeLists.txt | 4 + .../par_ilut_approx_filter_kernels.cpp | 12 +- .../factorization/par_ilut_config.hpp | 29 +++ .../factorization/par_ilut_filter_kernels.cpp | 12 +- ...ls.hpp.inc => par_ilut_filter_kernels.hpp} | 26 +++ .../factorization/par_ilut_select_common.cpp | 10 +- .../factorization/par_ilut_select_common.hpp | 11 +- .../factorization/par_ilut_select_kernels.cpp | 11 +- ...ls.hpp.inc => par_ilut_select_kernels.hpp} | 24 +++ cuda/CMakeLists.txt | 4 - .../par_ilut_approx_filter_kernels.cu | 179 ------------------ cuda/factorization/par_ilut_filter_kernels.cu | 137 -------------- cuda/factorization/par_ilut_select_common.cu | 95 ---------- hip/CMakeLists.txt | 4 - .../par_ilut_select_common.hip.hpp | 51 ----- .../par_ilut_select_kernels.hip.cpp | 158 ---------------- 16 files changed, 106 insertions(+), 661 deletions(-) rename hip/factorization/par_ilut_approx_filter_kernels.hip.cpp => common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp (97%) create mode 100644 common/cuda_hip/factorization/par_ilut_config.hpp rename hip/factorization/par_ilut_filter_kernels.hip.cpp => common/cuda_hip/factorization/par_ilut_filter_kernels.cpp (98%) rename common/cuda_hip/factorization/{par_ilut_filter_kernels.hpp.inc => par_ilut_filter_kernels.hpp} (85%) rename hip/factorization/par_ilut_select_common.hip.cpp => common/cuda_hip/factorization/par_ilut_select_common.cpp (96%) rename cuda/factorization/par_ilut_select_common.cuh => common/cuda_hip/factorization/par_ilut_select_common.hpp 
(79%) rename cuda/factorization/par_ilut_select_kernels.cu => common/cuda_hip/factorization/par_ilut_select_kernels.cpp (97%) rename common/cuda_hip/factorization/{par_ilut_select_kernels.hpp.inc => par_ilut_select_kernels.hpp} (91%) delete mode 100644 cuda/factorization/par_ilut_approx_filter_kernels.cu delete mode 100644 cuda/factorization/par_ilut_filter_kernels.cu delete mode 100644 cuda/factorization/par_ilut_select_common.cu delete mode 100644 hip/factorization/par_ilut_select_common.hip.hpp delete mode 100644 hip/factorization/par_ilut_select_kernels.hip.cpp diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index c4a56482b1d..a333ea9569c 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -15,6 +15,10 @@ set(CUDA_HIP_SOURCES factorization/par_ic_kernels.cpp factorization/par_ict_kernels.cpp factorization/par_ilu_kernels.cpp + factorization/par_ilut_approx_filter_kernels.cpp + factorization/par_ilut_filter_kernels.cpp + factorization/par_ilut_select_common.cpp + factorization/par_ilut_select_kernels.cpp factorization/par_ilut_spgeam_kernels.cpp factorization/par_ilut_sweep_kernels.cpp matrix/coo_kernels.cpp diff --git a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp b/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp similarity index 97% rename from hip/factorization/par_ilut_approx_filter_kernels.hip.cpp rename to common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp index 31482cd4034..12d8da9e4f5 100644 --- a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp +++ b/common/cuda_hip/factorization/par_ilut_approx_filter_kernels.cpp @@ -20,18 +20,20 @@ #include "common/cuda_hip/components/prefix_sum.hpp" #include "common/cuda_hip/components/sorting.hpp" #include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp" +#include "common/cuda_hip/factorization/par_ilut_select_common.hpp" +#include 
"common/cuda_hip/factorization/par_ilut_select_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/factorization/par_ilut_select_common.hip.hpp" namespace gko { namespace kernels { -namespace hip { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The parallel ILUT factorization namespace. * @@ -45,10 +47,6 @@ using compiled_kernels = syn::value_list; -#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" -#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc" - - template void threshold_filter_approx(syn::value_list, std::shared_ptr exec, @@ -175,6 +173,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace par_ilut_factorization -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_config.hpp b/common/cuda_hip/factorization/par_ilut_config.hpp new file mode 100644 index 00000000000..0aaa6211bd6 --- /dev/null +++ b/common/cuda_hip/factorization/par_ilut_config.hpp @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_ +#define GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_ + +#include "common/cuda_hip/base/config.hpp" + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace par_ilut_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for add_candidates kernels +using compiled_kernels = + syn::value_list; + + +} // namespace par_ilut_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + +#endif // 
GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_ diff --git a/hip/factorization/par_ilut_filter_kernels.hip.cpp b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp similarity index 98% rename from hip/factorization/par_ilut_filter_kernels.hip.cpp rename to common/cuda_hip/factorization/par_ilut_filter_kernels.cpp index bbe0b197d7c..25432fb44d2 100644 --- a/hip/factorization/par_ilut_filter_kernels.hip.cpp +++ b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp" + #include #include #include @@ -25,7 +27,7 @@ namespace gko { namespace kernels { -namespace hip { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The parallel ILUT factorization namespace. * @@ -34,17 +36,11 @@ namespace hip { namespace par_ilut_factorization { -constexpr int default_block_size = 512; - - // subwarp sizes for filter kernels using compiled_kernels = syn::value_list; -#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" - - namespace { @@ -132,6 +128,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace par_ilut_factorization -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp similarity index 85% rename from common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ilut_filter_kernels.hpp index 68794bfc8d1..6312c1af5f5 100644 --- a/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp @@ -2,6 +2,26 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_FILTER_KERNELS_HIP_HPP_ +#define GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_FILTER_KERNELS_HIP_HPP_ + +#include 
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/factorization/par_ilut_config.hpp" +#include "core/factorization/par_ilut_kernels.hpp" + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace par_ilut_factorization { namespace kernel { @@ -162,3 +182,9 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter( } // namespace kernel +} // namespace par_ilut_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + +#endif // GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_FILTER_KERNELS_HIP_HPP_ diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/common/cuda_hip/factorization/par_ilut_select_common.cpp similarity index 96% rename from hip/factorization/par_ilut_select_common.hip.cpp rename to common/cuda_hip/factorization/par_ilut_select_common.cpp index 89ceca0a024..fccb89fcf5a 100644 --- a/hip/factorization/par_ilut_select_common.hip.cpp +++ b/common/cuda_hip/factorization/par_ilut_select_common.cpp @@ -8,7 +8,7 @@ // clang-format on -#include "hip/factorization/par_ilut_select_common.hip.hpp" +#include "common/cuda_hip/factorization/par_ilut_select_common.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/components/atomic.hpp" @@ -17,13 +17,14 @@ #include "common/cuda_hip/components/searching.hpp" #include "common/cuda_hip/components/sorting.hpp" #include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp" #include 
"core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" namespace gko { namespace kernels { -namespace hip { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The parallel ILUT factorization namespace. * @@ -32,9 +33,6 @@ namespace hip { namespace par_ilut_factorization { -#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc" - - template void sampleselect_count(std::shared_ptr exec, const ValueType* values, IndexType size, @@ -96,6 +94,6 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(DECLARE_SSSS_FIND_BUCKET); } // namespace par_ilut_factorization -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/cuda/factorization/par_ilut_select_common.cuh b/common/cuda_hip/factorization/par_ilut_select_common.hpp similarity index 79% rename from cuda/factorization/par_ilut_select_common.cuh rename to common/cuda_hip/factorization/par_ilut_select_common.hpp index 4cb7dd55258..eca9e5cc4ac 100644 --- a/cuda/factorization/par_ilut_select_common.cuh +++ b/common/cuda_hip/factorization/par_ilut_select_common.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_CUDA_FACTORIZATION_PAR_ILUT_SELECT_COMMON_CUH_ -#define GKO_CUDA_FACTORIZATION_PAR_ILUT_SELECT_COMMON_CUH_ +#ifndef GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HPP_ +#define GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HPP_ #include @@ -13,11 +13,10 @@ namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { namespace par_ilut_factorization { -constexpr int default_block_size = 512; constexpr int items_per_thread = 16; @@ -43,9 +42,9 @@ sampleselect_bucket sampleselect_find_bucket( } // namespace par_ilut_factorization -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko -#endif // GKO_CUDA_FACTORIZATION_PAR_ILUT_SELECT_COMMON_CUH_ +#endif // 
GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HPP_ diff --git a/cuda/factorization/par_ilut_select_kernels.cu b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp similarity index 97% rename from cuda/factorization/par_ilut_select_kernels.cu rename to common/cuda_hip/factorization/par_ilut_select_kernels.cpp index a2395a16aea..e03ee379977 100644 --- a/cuda/factorization/par_ilut_select_kernels.cu +++ b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp" + #include #include @@ -16,14 +18,14 @@ #include "common/cuda_hip/components/searching.hpp" #include "common/cuda_hip/components/sorting.hpp" #include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/factorization/par_ilut_select_common.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" -#include "cuda/factorization/par_ilut_select_common.cuh" namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The parallel ILUT factorization namespace. 
* @@ -32,9 +34,6 @@ namespace cuda { namespace par_ilut_factorization { -#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc" - - template void sampleselect_filter(std::shared_ptr exec, const ValueType* values, IndexType size, @@ -153,6 +152,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace par_ilut_factorization -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp similarity index 91% rename from common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ilut_select_kernels.hpp index 2ee5061d4c5..6f5940c2b14 100644 --- a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp @@ -2,6 +2,24 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_KERNELS_HIP_HPP_ +#define GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_KERNELS_HIP_HPP_ + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/factorization/par_ilut_config.hpp" +#include "core/factorization/par_ilut_kernels.hpp" + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace par_ilut_factorization { namespace kernel { @@ -278,3 +296,9 @@ __global__ __launch_bounds__(config::warp_size) void find_bucket( } // namespace kernel +} // namespace par_ilut_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // 
namespace gko + +#endif // GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_SELECT_KERNELS_HIP_HPP_ \ No newline at end of file diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index ae506faed4b..ba02918928c 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -18,10 +18,6 @@ target_sources(ginkgo_cuda base/stream.cpp base/timer.cpp base/version.cpp - factorization/par_ilut_approx_filter_kernels.cu - factorization/par_ilut_filter_kernels.cu - factorization/par_ilut_select_common.cu - factorization/par_ilut_select_kernels.cu matrix/batch_csr_kernels.cu matrix/batch_dense_kernels.cu matrix/batch_ell_kernels.cu diff --git a/cuda/factorization/par_ilut_approx_filter_kernels.cu b/cuda/factorization/par_ilut_approx_filter_kernels.cu deleted file mode 100644 index 93c0ef7fc95..00000000000 --- a/cuda/factorization/par_ilut_approx_filter_kernels.cu +++ /dev/null @@ -1,179 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include - -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/atomic.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/prefix_sum.hpp" -#include "common/cuda_hip/components/sorting.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/factorization/par_ilut_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/factorization/par_ilut_select_common.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ILUT factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -// subwarp sizes for filter kernels -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" -#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc" - - -template -void threshold_filter_approx(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* m, - IndexType rank, array* tmp, - remove_complex* threshold, - matrix::Csr* m_out, - matrix::Coo* m_out_coo) -{ - auto values = m->get_const_values(); - IndexType size = m->get_num_stored_elements(); - using AbsType = remove_complex; - constexpr auto bucket_count = kernel::searchtree_width; - auto max_num_threads = ceildiv(size, items_per_thread); - auto max_num_blocks = ceildiv(max_num_threads, default_block_size); - - size_type tmp_size_totals = - ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType)); - size_type tmp_size_partials = ceildiv( - bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType)); - size_type tmp_size_oracles = - ceildiv(size * sizeof(unsigned char), sizeof(ValueType)); - size_type tmp_size_tree = - ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType)); - size_type tmp_size = - tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree; - tmp->resize_and_reset(tmp_size); - - auto total_counts = reinterpret_cast(tmp->get_data()); - auto partial_counts = - reinterpret_cast(tmp->get_data() + tmp_size_totals); - auto oracles = reinterpret_cast( - tmp->get_data() + tmp_size_totals + tmp_size_partials); - auto tree = - reinterpret_cast(tmp->get_data() + tmp_size_totals + - tmp_size_partials + tmp_size_oracles); - - sampleselect_count(exec, values, size, tree, oracles, partial_counts, - total_counts); - - // determine bucket with correct rank - auto bucket = static_cast( - sampleselect_find_bucket(exec, total_counts, rank).idx); - *threshold = - exec->copy_val_to_host(tree + 
kernel::searchtree_inner_size + bucket); - // we implicitly set the first splitter to -inf, but 0 works as well - if (bucket == 0) { - *threshold = zero(); - } - - // filter the elements - auto old_row_ptrs = m->get_const_row_ptrs(); - auto old_col_idxs = m->get_const_col_idxs(); - auto old_vals = m->get_const_values(); - // compute nnz for each row - auto num_rows = static_cast(m->get_size()[0]); - auto block_size = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, block_size); - auto new_row_ptrs = m_out->get_row_ptrs(); - if (num_blocks > 0) { - kernel::bucket_filter_nnz - <<get_stream()>>>( - old_row_ptrs, oracles, num_rows, bucket, new_row_ptrs); - } - - // build row pointers - components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1); - - // build matrix - auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows); - // resize arrays and update aliases - matrix::CsrBuilder builder{m_out}; - builder.get_col_idx_array().resize_and_reset(new_nnz); - builder.get_value_array().resize_and_reset(new_nnz); - auto new_col_idxs = m_out->get_col_idxs(); - auto new_vals = m_out->get_values(); - IndexType* new_row_idxs{}; - if (m_out_coo) { - matrix::CooBuilder coo_builder{m_out_coo}; - coo_builder.get_row_idx_array().resize_and_reset(new_nnz); - coo_builder.get_col_idx_array() = - make_array_view(exec, new_nnz, new_col_idxs); - coo_builder.get_value_array() = - make_array_view(exec, new_nnz, new_vals); - new_row_idxs = m_out_coo->get_row_idxs(); - } - if (num_blocks > 0) { - kernel::bucket_filter - <<get_stream()>>>( - old_row_ptrs, old_col_idxs, as_device_type(old_vals), oracles, - num_rows, bucket, new_row_ptrs, new_row_idxs, new_col_idxs, - as_device_type(new_vals)); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter_approx, - threshold_filter_approx); - - -template -void threshold_filter_approx(std::shared_ptr exec, - const matrix::Csr* m, - IndexType rank, array& tmp, - remove_complex& threshold, - 
matrix::Csr* m_out, - matrix::Coo* m_out_coo) -{ - auto num_rows = m->get_size()[0]; - auto total_nnz = m->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_threshold_filter_approx( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, m, rank, &tmp, - &threshold, m_out, m_out_coo); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/par_ilut_filter_kernels.cu b/cuda/factorization/par_ilut_filter_kernels.cu deleted file mode 100644 index 3d6b41f07e6..00000000000 --- a/cuda/factorization/par_ilut_filter_kernels.cu +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include -#include -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/factorization/par_ilut_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ILUT factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for filter kernels -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" - - -namespace { - - -template -void threshold_filter(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - remove_complex threshold, - matrix::Csr* m_out, - matrix::Coo* m_out_coo, bool lower) -{ - auto old_row_ptrs = a->get_const_row_ptrs(); - auto old_col_idxs = a->get_const_col_idxs(); - auto old_vals = a->get_const_values(); - // compute nnz for each row - auto num_rows = static_cast(a->get_size()[0]); - auto block_size = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, block_size); - auto new_row_ptrs = m_out->get_row_ptrs(); - if (num_blocks > 0) { - kernel::threshold_filter_nnz - <<get_stream()>>>( - old_row_ptrs, as_device_type(old_vals), num_rows, - as_device_type(threshold), new_row_ptrs, lower); - } - - // build row pointers - components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1); - - // build matrix - auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows); - // resize arrays and update aliases - matrix::CsrBuilder builder{m_out}; - builder.get_col_idx_array().resize_and_reset(new_nnz); - builder.get_value_array().resize_and_reset(new_nnz); - auto new_col_idxs = m_out->get_col_idxs(); - auto new_vals = m_out->get_values(); - IndexType* new_row_idxs{}; - if (m_out_coo) { - matrix::CooBuilder coo_builder{m_out_coo}; - coo_builder.get_row_idx_array().resize_and_reset(new_nnz); - coo_builder.get_col_idx_array() = - make_array_view(exec, new_nnz, new_col_idxs); - coo_builder.get_value_array() = - make_array_view(exec, new_nnz, new_vals); - new_row_idxs = m_out_coo->get_row_idxs(); - } - if (num_blocks > 0) { - kernel::threshold_filter - <<get_stream()>>>( - old_row_ptrs, old_col_idxs, as_device_type(old_vals), num_rows, - 
as_device_type(threshold), new_row_ptrs, new_row_idxs, - new_col_idxs, as_device_type(new_vals), lower); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter); - - -} // namespace - -template -void threshold_filter(std::shared_ptr exec, - const matrix::Csr* a, - remove_complex threshold, - matrix::Csr* m_out, - matrix::Coo* m_out_coo, bool lower) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_threshold_filter( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, - m_out_coo, lower); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/par_ilut_select_common.cu b/cuda/factorization/par_ilut_select_common.cu deleted file mode 100644 index e0b81a81a1c..00000000000 --- a/cuda/factorization/par_ilut_select_common.cu +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "cuda/factorization/par_ilut_select_common.cuh" - -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/components/atomic.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/prefix_sum.hpp" -#include "common/cuda_hip/components/searching.hpp" -#include "common/cuda_hip/components/sorting.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/factorization/par_ilut_kernels.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ILUT factorization 
namespace. - * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc" - - -template -void sampleselect_count(std::shared_ptr exec, - const ValueType* values, IndexType size, - remove_complex* tree, unsigned char* oracles, - IndexType* partial_counts, IndexType* total_counts) -{ - constexpr auto bucket_count = kernel::searchtree_width; - auto num_threads_total = ceildiv(size, items_per_thread); - auto num_blocks = - static_cast(ceildiv(num_threads_total, default_block_size)); - // pick sample, build searchtree - kernel::build_searchtree<<<1, bucket_count, 0, exec->get_stream()>>>( - as_device_type(values), size, as_device_type(tree)); - // determine bucket sizes - if (num_blocks > 0) { - kernel::count_buckets<<get_stream()>>>( - as_device_type(values), size, as_device_type(tree), partial_counts, - oracles, items_per_thread); - } - // compute prefix sum and total sum over block-local values - kernel::block_prefix_sum<<get_stream()>>>(partial_counts, - total_counts, num_blocks); - // compute prefix sum over bucket counts - components::prefix_sum_nonnegative(exec, total_counts, bucket_count + 1); -} - - -#define DECLARE_SSSS_COUNT(ValueType, IndexType) \ - void sampleselect_count(std::shared_ptr exec, \ - const ValueType* values, IndexType size, \ - remove_complex* tree, \ - unsigned char* oracles, IndexType* partial_counts, \ - IndexType* total_counts) - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(DECLARE_SSSS_COUNT); - - -template -sampleselect_bucket sampleselect_find_bucket( - std::shared_ptr exec, IndexType* prefix_sum, - IndexType rank) -{ - kernel::find_bucket<<<1, config::warp_size, 0, exec->get_stream()>>>( - prefix_sum, rank); - IndexType values[3]{}; - exec->get_master()->copy_from(exec, 3, prefix_sum, values); - return {values[0], values[1], values[2]}; -} - - -#define DECLARE_SSSS_FIND_BUCKET(IndexType) \ - sampleselect_bucket sampleselect_find_bucket( \ - 
std::shared_ptr exec, IndexType* prefix_sum, \ - IndexType rank) - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(DECLARE_SSSS_FIND_BUCKET); - - -} // namespace par_ilut_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 0bfe56d7db1..4dd54c53782 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -16,10 +16,6 @@ set(GINKGO_HIP_SOURCES base/stream.hip.cpp base/timer.hip.cpp base/version.hip.cpp - factorization/par_ilut_approx_filter_kernels.hip.cpp - factorization/par_ilut_filter_kernels.hip.cpp - factorization/par_ilut_select_common.hip.cpp - factorization/par_ilut_select_kernels.hip.cpp matrix/batch_csr_kernels.hip.cpp matrix/batch_dense_kernels.hip.cpp matrix/batch_ell_kernels.hip.cpp diff --git a/hip/factorization/par_ilut_select_common.hip.hpp b/hip/factorization/par_ilut_select_common.hip.hpp deleted file mode 100644 index 290de30f5df..00000000000 --- a/hip/factorization/par_ilut_select_common.hip.hpp +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HIP_HPP_ -#define GKO_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HIP_HPP_ - - -#include -#include -#include - - -namespace gko { -namespace kernels { -namespace hip { -namespace par_ilut_factorization { - - -constexpr int default_block_size = 512; -constexpr int items_per_thread = 16; - - -template -void sampleselect_count(std::shared_ptr exec, - const ValueType* values, IndexType size, - remove_complex* tree, unsigned char* oracles, - IndexType* partial_counts, IndexType* total_counts); - - -template -struct sampleselect_bucket { - IndexType idx; - IndexType begin; - IndexType size; -}; - - -template -sampleselect_bucket sampleselect_find_bucket( - std::shared_ptr exec, IndexType* prefix_sum, - IndexType rank); - - -} // namespace par_ilut_factorization -} // namespace hip -} // namespace 
kernels -} // namespace gko - - -#endif // GKO_HIP_FACTORIZATION_PAR_ILUT_SELECT_COMMON_HIP_HPP_ diff --git a/hip/factorization/par_ilut_select_kernels.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp deleted file mode 100644 index 2e75f7de81b..00000000000 --- a/hip/factorization/par_ilut_select_kernels.hip.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/atomic.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/prefix_sum.hpp" -#include "common/cuda_hip/components/searching.hpp" -#include "common/cuda_hip/components/sorting.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/factorization/par_ilut_kernels.hpp" -#include "hip/factorization/par_ilut_select_common.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The parallel ILUT factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc" - - -template -void sampleselect_filter(std::shared_ptr exec, - const ValueType* values, IndexType size, - const unsigned char* oracles, - const IndexType* partial_counts, IndexType bucket, - remove_complex* out) -{ - auto num_threads_total = ceildiv(size, items_per_thread); - auto num_blocks = - static_cast(ceildiv(num_threads_total, default_block_size)); - if (num_blocks > 0) { - kernel::filter_bucket<<get_stream()>>>( - as_device_type(values), size, bucket, oracles, partial_counts, - as_device_type(out), items_per_thread); - } -} - - -template -void threshold_select(std::shared_ptr exec, - const matrix::Csr* m, - IndexType rank, array& tmp1, - array>& tmp2, - remove_complex& threshold) -{ - auto values = m->get_const_values(); - IndexType size = m->get_num_stored_elements(); - using AbsType = remove_complex; - constexpr auto bucket_count = kernel::searchtree_width; - auto max_num_threads = ceildiv(size, items_per_thread); - auto max_num_blocks = ceildiv(max_num_threads, default_block_size); - - size_type tmp_size_totals = - ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType)); - size_type tmp_size_partials = ceildiv( - bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType)); - size_type tmp_size_oracles = - ceildiv(size * sizeof(unsigned char), sizeof(ValueType)); - size_type tmp_size_tree = - ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType)); - size_type tmp_size_vals = - size / bucket_count * 4; // pessimistic estimate for temporary storage - size_type tmp_size = - tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree; - tmp1.resize_and_reset(tmp_size); - tmp2.resize_and_reset(tmp_size_vals); - - auto total_counts = reinterpret_cast(tmp1.get_data()); - auto partial_counts = - reinterpret_cast(tmp1.get_data() + tmp_size_totals); - auto oracles = 
reinterpret_cast( - tmp1.get_data() + tmp_size_totals + tmp_size_partials); - auto tree = - reinterpret_cast(tmp1.get_data() + tmp_size_totals + - tmp_size_partials + tmp_size_oracles); - - sampleselect_count(exec, values, size, tree, oracles, partial_counts, - total_counts); - - // determine bucket with correct rank, use bucket-local rank - auto bucket = sampleselect_find_bucket(exec, total_counts, rank); - rank -= bucket.begin; - - if (bucket.size * 2 > tmp_size_vals) { - // we need to reallocate tmp2 - tmp2.resize_and_reset(bucket.size * 2); - } - auto tmp21 = tmp2.get_data(); - auto tmp22 = tmp2.get_data() + bucket.size; - // extract target bucket - sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx, - tmp22); - - // recursively select from smaller buckets - int step{}; - while (bucket.size > kernel::basecase_size) { - std::swap(tmp21, tmp22); - const auto* tmp_in = tmp21; - auto tmp_out = tmp22; - - sampleselect_count(exec, tmp_in, bucket.size, tree, oracles, - partial_counts, total_counts); - auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank); - sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts, - bucket.idx, tmp_out); - - rank -= new_bucket.begin; - bucket.size = new_bucket.size; - // we should never need more than 5 recursion steps, this would mean - // 256^5 = 2^40. fall back to standard library algorithm in that case. 
- ++step; - if (step > 5) { - array cpu_out_array{ - exec->get_master(), - make_array_view(exec, bucket.size, tmp_out)}; - auto begin = cpu_out_array.get_data(); - auto end = begin + bucket.size; - auto middle = begin + rank; - std::nth_element(begin, middle, end); - threshold = *middle; - return; - } - } - - // base case - auto out_ptr = reinterpret_cast(tmp1.get_data()); - kernel::basecase_select<<<1, kernel::basecase_block_size, 0, - exec->get_stream()>>>( - as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr)); - threshold = exec->copy_val_to_host(out_ptr); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace hip -} // namespace kernels -} // namespace gko From afef0b8bca517ea4e1d0de97110c3403ba3b7ace Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 13 Jul 2024 00:19:21 +0200 Subject: [PATCH 088/448] unify index_set kernels --- common/cuda_hip/CMakeLists.txt | 1 + .../cuda_hip}/base/index_set_kernels.cpp | 14 +--- cuda/CMakeLists.txt | 1 - hip/CMakeLists.txt | 1 - hip/base/index_set_kernels.hip.cpp | 83 ------------------- 5 files changed, 3 insertions(+), 97 deletions(-) rename {cuda => common/cuda_hip}/base/index_set_kernels.cpp (93%) delete mode 100644 hip/base/index_set_kernels.hip.cpp diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index a333ea9569c..88353204488 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -1,6 +1,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) set(CUDA_HIP_SOURCES base/device_matrix_data_kernels.cpp + base/index_set_kernels.cpp components/prefix_sum_kernels.cpp distributed/index_map_kernels.cpp distributed/matrix_kernels.cpp diff --git a/cuda/base/index_set_kernels.cpp b/common/cuda_hip/base/index_set_kernels.cpp similarity index 93% rename from cuda/base/index_set_kernels.cpp rename to 
common/cuda_hip/base/index_set_kernels.cpp index 2041833e4c2..0a47752d17e 100644 --- a/cuda/base/index_set_kernels.cpp +++ b/common/cuda_hip/base/index_set_kernels.cpp @@ -13,17 +13,7 @@ namespace gko { namespace kernels { -/** - * @brief The Cuda namespace. - * - * @ingroup cuda - */ -namespace cuda { -/** - * @brief The index_set namespace. - * - * @ingroup index_set - */ +namespace GKO_DEVICE_NAMESPACE { namespace idx_set { @@ -78,6 +68,6 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( } // namespace idx_set -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index ba02918928c..c9bb448b79b 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -11,7 +11,6 @@ target_sources(ginkgo_cuda base/device.cpp base/exception.cpp base/executor.cpp - base/index_set_kernels.cpp base/memory.cpp base/nvtx.cpp base/scoped_device_id.cpp diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 4dd54c53782..e6a337da7b9 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -9,7 +9,6 @@ set(GINKGO_HIP_SOURCES base/device.hip.cpp base/exception.hip.cpp base/executor.hip.cpp - base/index_set_kernels.hip.cpp base/memory.hip.cpp base/roctx.hip.cpp base/scoped_device_id.hip.cpp diff --git a/hip/base/index_set_kernels.hip.cpp b/hip/base/index_set_kernels.hip.cpp deleted file mode 100644 index 9f9f967fe35..00000000000 --- a/hip/base/index_set_kernels.hip.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/base/index_set_kernels.hpp" - -#include - -#include -#include -#include - - -namespace gko { -namespace kernels { -/** - * @brief The Hip namespace. - * - * @ingroup hip - */ -namespace hip { -/** - * @brief The index_set namespace. 
- * - * @ingroup index_set - */ -namespace idx_set { - - -template -void to_global_indices(std::shared_ptr exec, - const IndexType num_subsets, - const IndexType* subset_begin, - const IndexType* subset_end, - const IndexType* superset_indices, - IndexType* decomp_indices) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_INDEX_SET_TO_GLOBAL_INDICES_KERNEL); - - -template -void populate_subsets(std::shared_ptr exec, - const IndexType index_space_size, - const array* indices, - array* subset_begin, - array* subset_end, - array* superset_indices, - const bool is_sorted) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_INDEX_SET_POPULATE_KERNEL); - - -template -void global_to_local(std::shared_ptr exec, - const IndexType index_space_size, - const IndexType num_subsets, const IndexType* subset_begin, - const IndexType* subset_end, - const IndexType* superset_indices, - const IndexType num_indices, - const IndexType* global_indices, IndexType* local_indices, - const bool is_sorted) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_INDEX_SET_GLOBAL_TO_LOCAL_KERNEL); - - -template -void local_to_global(std::shared_ptr exec, - const IndexType num_subsets, const IndexType* subset_begin, - const IndexType* superset_indices, - const IndexType num_indices, - const IndexType* local_indices, IndexType* global_indices, - const bool is_sorted) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( - GKO_DECLARE_INDEX_SET_LOCAL_TO_GLOBAL_KERNEL); - - -} // namespace idx_set -} // namespace hip -} // namespace kernels -} // namespace gko From bb999199459d87a6862311e08164a8f0ac50bee6 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 13 Jul 2024 08:50:42 +0200 Subject: [PATCH 089/448] fix ILU --- common/cuda_hip/factorization/ilu_kernels.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/cuda_hip/factorization/ilu_kernels.cpp b/common/cuda_hip/factorization/ilu_kernels.cpp 
index b3f959bba02..0469b80fe86 100644 --- a/common/cuda_hip/factorization/ilu_kernels.cpp +++ b/common/cuda_hip/factorization/ilu_kernels.cpp @@ -46,7 +46,7 @@ void compute_lu(std::shared_ptr exec, SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); // CUDA 11.4 has a use-after-free bug on Turing -#if defined(GKO_BUILDING_CUDA) && (CUDA_VERSION >= 11040) +#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040) exec->synchronize(); #endif From 43a445ab0d324187280f5c3303ad537d76fa0c10 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 13 Jul 2024 11:26:22 +0200 Subject: [PATCH 090/448] unify Jacobi --- common/cuda_hip/CMakeLists.txt | 25 ++++ .../jacobi_advanced_apply_kernels.cpp | 6 +- .../jacobi_advanced_apply_kernels.hpp.inc | 81 ------------- ...obi_advanced_apply_kernels.instantiate.cpp | 85 ++++++++++++-- .../preconditioner/jacobi_common.hpp.in | 13 +-- .../jacobi_generate_kernels.cpp | 14 +-- ...> jacobi_generate_kernels.instantiate.cpp} | 96 ++++++++++++++++ .../preconditioner/jacobi_kernels.cpp | 5 - .../jacobi_simple_apply_kernels.cpp | 11 +- .../jacobi_simple_apply_kernels.hpp.inc | 76 ------------ ...acobi_simple_apply_kernels.instantiate.cpp | 80 +++++++++++-- cuda/CMakeLists.txt | 32 ++---- .../jacobi_advanced_apply_kernels.cu | 75 ------------ ...cobi_advanced_apply_kernels.instantiate.cu | 100 ---------------- .../preconditioner/jacobi_generate_kernels.cu | 72 ------------ .../jacobi_generate_kernels.instantiate.cu | 108 ------------------ hip/CMakeLists.txt | 39 ++----- hip/preconditioner/jacobi_common.hip.hpp.in | 43 ------- ...acobi_generate_kernels.instantiate.hip.cpp | 108 ------------------ .../jacobi_simple_apply_kernels.hip.cpp | 84 -------------- ...i_simple_apply_kernels.instantiate.hip.cpp | 97 ---------------- 21 files changed, 301 insertions(+), 949 deletions(-) rename hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp => common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp (94%) delete mode 100644 
common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc rename hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp => common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp (52%) rename {cuda => common/cuda_hip}/preconditioner/jacobi_common.hpp.in (70%) rename hip/preconditioner/jacobi_generate_kernels.hip.cpp => common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp (92%) rename common/cuda_hip/preconditioner/{jacobi_generate_kernels.hpp.inc => jacobi_generate_kernels.instantiate.cpp} (66%) rename cuda/preconditioner/jacobi_simple_apply_kernels.cu => common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp (92%) delete mode 100644 common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc rename cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu => common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp (52%) delete mode 100644 cuda/preconditioner/jacobi_advanced_apply_kernels.cu delete mode 100644 cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu delete mode 100644 cuda/preconditioner/jacobi_generate_kernels.cu delete mode 100644 cuda/preconditioner/jacobi_generate_kernels.instantiate.cu delete mode 100644 hip/preconditioner/jacobi_common.hip.hpp.in delete mode 100644 hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp delete mode 100644 hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp delete mode 100644 hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 88353204488..463abfd9284 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -31,6 +31,9 @@ set(CUDA_HIP_SOURCES multigrid/pgm_kernels.cpp preconditioner/isai_kernels.cpp preconditioner/jacobi_kernels.cpp + preconditioner/jacobi_advanced_apply_kernels.cpp + preconditioner/jacobi_generate_kernels.cpp + preconditioner/jacobi_simple_apply_kernels.cpp 
reorder/rcm_kernels.cpp solver/cb_gmres_kernels.cpp solver/idr_kernels.cpp @@ -38,5 +41,27 @@ set(CUDA_HIP_SOURCES stop/criterion_kernels.cpp stop/residual_norm_kernels.cpp ) +# create files for all potentially used block sizes +foreach(GKO_JACOBI_BLOCK_SIZE RANGE 1 64) + configure_file( + preconditioner/jacobi_generate_kernels.instantiate.cpp + preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cpp) + configure_file( + preconditioner/jacobi_simple_apply_kernels.instantiate.cpp + preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cpp) + configure_file( + preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp + preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cpp) +endforeach() +function(jacobi_generated_files variable_name block_sizes) + set(${variable_name}) + foreach(block_size IN LISTS block_sizes) + list(APPEND variable_name + ${Ginkgo_BINARY_DIR}/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.${block_size}.cpp + ${Ginkgo_BINARY_DIR}/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.${block_size}.cpp + ${Ginkgo_BINARY_DIR}/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.${block_size}.cpp) + endforeach() + set(${variable_name} ${${variable_name}} PARENT_SCOPE) +endfunction() list(TRANSFORM CUDA_HIP_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/) set(GKO_CUDA_HIP_COMMON_SOURCES ${CUDA_HIP_SOURCES} PARENT_SCOPE) diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp similarity index 94% rename from hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp rename to common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp index 371a10051fc..27b4f57eb6c 100644 --- a/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp +++ b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.cpp @@ -13,7 +13,7 @@ namespace gko { 
namespace kernels { -namespace hip { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The Jacobi preconditioner namespace. * @ref Jacobi @@ -38,7 +38,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_advanced_apply, advanced_apply); template -void apply(std::shared_ptr exec, size_type num_blocks, +void apply(std::shared_ptr exec, size_type num_blocks, uint32 max_block_size, const preconditioner::block_interleaved_storage_scheme& storage_scheme, @@ -70,6 +70,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL); } // namespace jacobi -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc deleted file mode 100644 index 5d7a6966c78..00000000000 --- a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -namespace kernel { - - -template -__global__ void __launch_bounds__(warps_per_block* config::warp_size) - advanced_apply(const ValueType* __restrict__ blocks, - preconditioner::block_interleaved_storage_scheme - storage_scheme, - const IndexType* __restrict__ block_ptrs, - size_type num_blocks, const ValueType* __restrict__ alpha, - const ValueType* __restrict__ b, int32 b_stride, - ValueType* __restrict__ x, int32 x_stride) -{ - const auto block_id = - thread::get_subwarp_id(); - const auto subwarp = - group::tiled_partition(group::this_thread_block()); - if (block_id >= num_blocks) { - return; - } - const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; - ValueType v = zero(); - if (subwarp.thread_rank() < block_size) { - v = alpha[0] * - b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; - } - multiply_vec( - subwarp, block_size, v, - blocks + 
storage_scheme.get_global_block_offset(block_id) + - subwarp.thread_rank(), - storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, - x_stride, - [](ValueType& result, const ValueType& out) { result += out; }); -} - - -template -__global__ void -__launch_bounds__(warps_per_block* config::warp_size) advanced_adaptive_apply( - const ValueType* __restrict__ blocks, - preconditioner::block_interleaved_storage_scheme storage_scheme, - const precision_reduction* __restrict__ block_precisions, - const IndexType* __restrict__ block_ptrs, size_type num_blocks, - const ValueType* __restrict__ alpha, const ValueType* __restrict__ b, - int32 b_stride, ValueType* __restrict__ x, int32 x_stride) -{ - const auto block_id = - thread::get_subwarp_id(); - const auto subwarp = - group::tiled_partition(group::this_thread_block()); - if (block_id >= num_blocks) { - return; - } - const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; - auto alpha_val = alpha == nullptr ? one() : alpha[0]; - ValueType v = zero(); - if (subwarp.thread_rank() < block_size) { - v = alpha[0] * - b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; - } - GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( - ValueType, block_precisions[block_id], - multiply_vec( - subwarp, block_size, v, - reinterpret_cast( - blocks + storage_scheme.get_group_offset(block_id)) + - storage_scheme.get_block_offset(block_id) + - subwarp.thread_rank(), - storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, - x_stride, - [](ValueType& result, const ValueType& out) { result += out; })); -} - - -} // namespace kernel diff --git a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp similarity index 52% rename from hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp rename to common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp index 
42c542c228b..0ecc3d0d44b 100644 --- a/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp +++ b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp @@ -22,16 +22,85 @@ namespace gko { namespace kernels { -namespace hip { -/** - * @brief The Jacobi preconditioner namespace. - * @ref Jacobi - * @ingroup jacobi - */ +namespace GKO_DEVICE_NAMESPACE { namespace jacobi { +namespace kernel { + + +template +__global__ void __launch_bounds__(warps_per_block* config::warp_size) + advanced_apply(const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme + storage_scheme, + const IndexType* __restrict__ block_ptrs, + size_type num_blocks, const ValueType* __restrict__ alpha, + const ValueType* __restrict__ b, int32 b_stride, + ValueType* __restrict__ x, int32 x_stride) +{ + const auto block_id = + thread::get_subwarp_id(); + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = alpha[0] * + b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + multiply_vec( + subwarp, block_size, v, + blocks + storage_scheme.get_global_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType& result, const ValueType& out) { result += out; }); +} + + +template +__global__ void +__launch_bounds__(warps_per_block* config::warp_size) advanced_adaptive_apply( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const precision_reduction* __restrict__ block_precisions, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + const ValueType* __restrict__ alpha, const ValueType* __restrict__ b, + int32 b_stride, ValueType* __restrict__ x, int32 
x_stride) +{ + const auto block_id = + thread::get_subwarp_id(); + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + auto alpha_val = alpha == nullptr ? one() : alpha[0]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = alpha[0] * + b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, block_precisions[block_id], + multiply_vec( + subwarp, block_size, v, + reinterpret_cast( + blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType& result, const ValueType& out) { result += out; })); +} -#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc" +} // namespace kernel // clang-format off @@ -96,6 +165,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace jacobi -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/cuda/preconditioner/jacobi_common.hpp.in b/common/cuda_hip/preconditioner/jacobi_common.hpp.in similarity index 70% rename from cuda/preconditioner/jacobi_common.hpp.in rename to common/cuda_hip/preconditioner/jacobi_common.hpp.in index aeb47fec97e..b243c7de6a5 100644 --- a/cuda/preconditioner/jacobi_common.hpp.in +++ b/common/cuda_hip/preconditioner/jacobi_common.hpp.in @@ -5,12 +5,11 @@ #include #include - #include "common/cuda_hip/base/config.hpp" namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { namespace jacobi { @@ -19,15 +18,15 @@ namespace jacobi { * kernels should be compiled. 
*/ // clang-format off -#cmakedefine GKO_CUDA_JACOBI_BLOCK_SIZES_CODE @GKO_CUDA_JACOBI_BLOCK_SIZES_CODE@ +#cmakedefine GKO_JACOBI_BLOCK_SIZES_CODE @GKO_JACOBI_BLOCK_SIZES_CODE@ // clang-format on // make things easier for IDEs -#ifndef GKO_CUDA_JACOBI_BLOCK_SIZES_CODE -#define GKO_CUDA_JACOBI_BLOCK_SIZES_CODE 1 +#ifndef GKO_JACOBI_BLOCK_SIZES_CODE +#define GKO_JACOBI_BLOCK_SIZES_CODE 1 #endif -using compiled_kernels = syn::value_list; +using compiled_kernels = syn::value_list; constexpr int get_larger_power(int value, int guess = 1) @@ -37,6 +36,6 @@ constexpr int get_larger_power(int value, int guess = 1) } // namespace jacobi -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/hip/preconditioner/jacobi_generate_kernels.hip.cpp b/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp similarity index 92% rename from hip/preconditioner/jacobi_generate_kernels.hip.cpp rename to common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp index d295ebb046e..207550ff6b1 100644 --- a/hip/preconditioner/jacobi_generate_kernels.hip.cpp +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.cpp @@ -25,18 +25,10 @@ namespace gko { namespace kernels { -namespace hip { -/** - * @brief The Jacobi preconditioner namespace. 
- * @ref Jacobi - * @ingroup jacobi - */ +namespace GKO_DEVICE_NAMESPACE { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc" - - template void generate(syn::value_list, @@ -53,7 +45,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generate, generate); template -void generate(std::shared_ptr exec, +void generate(std::shared_ptr exec, const matrix::Csr* system_matrix, size_type num_blocks, uint32 max_block_size, remove_complex accuracy, @@ -81,6 +73,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace jacobi -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp similarity index 66% rename from common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc rename to common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp index 61a57ca5f81..d004309c622 100644 --- a/common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp @@ -2,6 +2,30 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/diagonal_block_manipulation.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" +#include "core/base/extended_float.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/preconditioner/jacobi_kernels.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +// generated header +#include 
"common/cuda_hip/preconditioner/jacobi_common.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace jacobi { namespace kernel { @@ -180,3 +204,75 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate( } // namespace kernel + + +// clang-format off +#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_JACOBI_BLOCK_SIZE +#define GKO_JACOBI_BLOCK_SIZE 1 +#endif + + +template +void generate(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* mtx, + remove_complex accuracy, ValueType* block_data, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + remove_complex* conditioning, + precision_reduction* block_precisions, + const IndexType* block_ptrs, size_type num_blocks) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const auto grid_size = + ceildiv(num_blocks, warps_per_block * blocks_per_warp); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (grid_size > 0) { + if (block_precisions) { + kernel::adaptive_generate + <<get_stream()>>>( + mtx->get_size()[0], mtx->get_const_row_ptrs(), + mtx->get_const_col_idxs(), + as_device_type(mtx->get_const_values()), + as_device_type(accuracy), as_device_type(block_data), + storage_scheme, as_device_type(conditioning), + block_precisions, block_ptrs, num_blocks); + } else { + kernel::generate + <<get_stream()>>>( + mtx->get_size()[0], mtx->get_const_row_ptrs(), + mtx->get_const_col_idxs(), + as_device_type(mtx->get_const_values()), + as_device_type(block_data), storage_scheme, block_ptrs, + num_blocks); + } + } +} + + +#define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType) \ + void generate( \ + syn::value_list, \ + std::shared_ptr exec, \ + const matrix::Csr*, remove_complex, \ + ValueType*, \ + const 
preconditioner::block_interleaved_storage_scheme&, \ + remove_complex*, precision_reduction*, const IndexType*, \ + size_type) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + DECLARE_JACOBI_GENERATE_INSTANTIATION); + + +} // namespace jacobi +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.cpp index 8cf5ad1e9fd..f3b099e7c18 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp +++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp @@ -22,11 +22,6 @@ namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The Jacobi preconditioner namespace. - * @ref Jacobi - * @ingroup jacobi - */ namespace jacobi { diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.cu b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp similarity index 92% rename from cuda/preconditioner/jacobi_simple_apply_kernels.cu rename to common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp index 62e49d30618..e9b7b10fd88 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernels.cu +++ b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.cpp @@ -13,12 +13,7 @@ namespace gko { namespace kernels { -namespace cuda { -/** - * @brief The Jacobi preconditioner namespace. 
- * @ref Jacobi - * @ingroup jacobi - */ +namespace GKO_DEVICE_NAMESPACE { namespace jacobi { @@ -38,7 +33,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_apply, apply); template void simple_apply( - std::shared_ptr exec, size_type num_blocks, + std::shared_ptr exec, size_type num_blocks, uint32 max_block_size, const preconditioner::block_interleaved_storage_scheme& storage_scheme, @@ -67,6 +62,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace jacobi -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc deleted file mode 100644 index c39016810fa..00000000000 --- a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc +++ /dev/null @@ -1,76 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -namespace kernel { - - -template -__global__ void __launch_bounds__(warps_per_block* config::warp_size) apply( - const ValueType* __restrict__ blocks, - preconditioner::block_interleaved_storage_scheme storage_scheme, - const IndexType* __restrict__ block_ptrs, size_type num_blocks, - const ValueType* __restrict__ b, int32 b_stride, ValueType* __restrict__ x, - int32 x_stride) -{ - const auto block_id = - thread::get_subwarp_id(); - const auto subwarp = - group::tiled_partition(group::this_thread_block()); - if (block_id >= num_blocks) { - return; - } - const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; - ValueType v = zero(); - if (subwarp.thread_rank() < block_size) { - v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; - } - multiply_vec( - subwarp, block_size, v, - blocks + storage_scheme.get_global_block_offset(block_id) + - subwarp.thread_rank(), - storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, - x_stride, - [](ValueType& result, 
const ValueType& out) { result = out; }); -} - - -template -__global__ void __launch_bounds__(warps_per_block* config::warp_size) - adaptive_apply(const ValueType* __restrict__ blocks, - preconditioner::block_interleaved_storage_scheme - storage_scheme, - const precision_reduction* __restrict__ block_precisions, - const IndexType* __restrict__ block_ptrs, - size_type num_blocks, const ValueType* __restrict__ b, - int32 b_stride, ValueType* __restrict__ x, int32 x_stride) -{ - const auto block_id = - thread::get_subwarp_id(); - const auto subwarp = - group::tiled_partition(group::this_thread_block()); - if (block_id >= num_blocks) { - return; - } - const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; - ValueType v = zero(); - if (subwarp.thread_rank() < block_size) { - v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; - } - GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( - ValueType, block_precisions[block_id], - multiply_vec( - subwarp, block_size, v, - reinterpret_cast( - blocks + storage_scheme.get_group_offset(block_id)) + - storage_scheme.get_block_offset(block_id) + - subwarp.thread_rank(), - storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, - x_stride, - [](ValueType& result, const ValueType& out) { result = out; })); -} - - -} // namespace kernel diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp similarity index 52% rename from cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu rename to common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp index d51b63487fe..734385970e3 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu +++ b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp @@ -21,16 +21,80 @@ namespace gko { namespace kernels { -namespace cuda { -/** - * @brief The Jacobi preconditioner namespace. 
- * @ref Jacobi - * @ingroup jacobi - */ +namespace GKO_DEVICE_NAMESPACE { namespace jacobi { +namespace kernel { -#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc" +template +__global__ void __launch_bounds__(warps_per_block* config::warp_size) apply( + const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme storage_scheme, + const IndexType* __restrict__ block_ptrs, size_type num_blocks, + const ValueType* __restrict__ b, int32 b_stride, ValueType* __restrict__ x, + int32 x_stride) +{ + const auto block_id = + thread::get_subwarp_id(); + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + multiply_vec( + subwarp, block_size, v, + blocks + storage_scheme.get_global_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType& result, const ValueType& out) { result = out; }); +} + + +template +__global__ void __launch_bounds__(warps_per_block* config::warp_size) + adaptive_apply(const ValueType* __restrict__ blocks, + preconditioner::block_interleaved_storage_scheme + storage_scheme, + const precision_reduction* __restrict__ block_precisions, + const IndexType* __restrict__ block_ptrs, + size_type num_blocks, const ValueType* __restrict__ b, + int32 b_stride, ValueType* __restrict__ x, int32 x_stride) +{ + const auto block_id = + thread::get_subwarp_id(); + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + if (block_id >= num_blocks) { + return; + } + const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; + ValueType v = zero(); + if (subwarp.thread_rank() < block_size) { + v = 
b[(block_ptrs[block_id] + subwarp.thread_rank()) * b_stride]; + } + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, block_precisions[block_id], + multiply_vec( + subwarp, block_size, v, + reinterpret_cast( + blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id) + + subwarp.thread_rank(), + storage_scheme.get_stride(), x + block_ptrs[block_id] * x_stride, + x_stride, + [](ValueType& result, const ValueType& out) { result = out; })); +} + + +} // namespace kernel // clang-format off @@ -92,6 +156,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace jacobi -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index c9bb448b79b..92b48518e7c 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -24,9 +24,6 @@ target_sources(ginkgo_cuda ${FBCSR_INSTANTIATE} matrix/fft_kernels.cu preconditioner/batch_jacobi_kernels.cu - preconditioner/jacobi_advanced_apply_kernels.cu - preconditioner/jacobi_generate_kernels.cu - preconditioner/jacobi_simple_apply_kernels.cu solver/batch_bicgstab_kernels.cu solver/batch_cg_kernels.cu solver/lower_trs_kernels.cu @@ -34,10 +31,6 @@ target_sources(ginkgo_cuda ${GKO_UNIFIED_COMMON_SOURCES} ${GKO_CUDA_HIP_COMMON_SOURCES} ) -# override the default language mapping for the common files, set them to CUDA -foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES FBCSR_INSTANTIATE) - set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA) -endforeach(source_file) if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) set(GKO_CUDA_JACOBI_BLOCK_SIZES) foreach(blocksize RANGE 1 32) @@ -46,25 +39,14 @@ if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) else() set(GKO_CUDA_JACOBI_BLOCK_SIZES 1 2 4 8 13 16 32) endif() -set(GKO_CUDA_JACOBI_SOURCES) -foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_CUDA_JACOBI_BLOCK_SIZES) - configure_file( - 
preconditioner/jacobi_generate_kernels.instantiate.cu - preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) - configure_file( - preconditioner/jacobi_simple_apply_kernels.instantiate.cu - preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) - configure_file( - preconditioner/jacobi_advanced_apply_kernels.instantiate.cu - preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) - list(APPEND GKO_CUDA_JACOBI_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) -endforeach() +jacobi_generated_files(GKO_CUDA_JACOBI_SOURCES "${GKO_CUDA_JACOBI_BLOCK_SIZES}") +# override the default language mapping for the common files, set them to CUDA +foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES GKO_CUDA_JACOBI_SOURCES FBCSR_INSTANTIATE) + set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA) +endforeach(source_file) target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES}) -string(REPLACE ";" "," GKO_CUDA_JACOBI_BLOCK_SIZES_CODE "${GKO_CUDA_JACOBI_BLOCK_SIZES}") -configure_file(preconditioner/jacobi_common.hpp.in common/cuda_hip/preconditioner/jacobi_common.hpp) +string(REPLACE ";" "," GKO_JACOBI_BLOCK_SIZES_CODE "${GKO_CUDA_JACOBI_BLOCK_SIZES}") +configure_file(${Ginkgo_SOURCE_DIR}/common/cuda_hip/preconditioner/jacobi_common.hpp.in common/cuda_hip/preconditioner/jacobi_common.hpp) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") # remove false positive CUDA warnings when calling one() and zero() diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu deleted file mode 100644 index 
a37296abf40..00000000000 --- a/cuda/preconditioner/jacobi_advanced_apply_kernels.cu +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include - -#include "core/matrix/dense_kernels.hpp" -#include "core/preconditioner/jacobi_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -// generated header -#include "common/cuda_hip/preconditioner/jacobi_common.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Jacobi preconditioner namespace. - * @ref Jacobi - * @ingroup jacobi - */ -namespace jacobi { - - -template -void advanced_apply( - syn::value_list, - std::shared_ptr exec, size_type num_blocks, - const precision_reduction* block_precisions, - const IndexType* block_pointers, const ValueType* blocks, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - const ValueType* alpha, const ValueType* b, size_type b_stride, - ValueType* x, size_type x_stride); - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_advanced_apply, advanced_apply); - - -template -void apply(std::shared_ptr exec, size_type num_blocks, - uint32 max_block_size, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - const array& block_precisions, - const array& block_pointers, - const array& blocks, - const matrix::Dense* alpha, - const matrix::Dense* b, - const matrix::Dense* beta, matrix::Dense* x) -{ - // TODO: write a special kernel for multiple RHS - dense::scale(exec, beta, x); - for (size_type col = 0; col < b->get_size()[1]; ++col) { - select_advanced_apply( - compiled_kernels(), - [&](int compiled_block_size) { - return max_block_size <= compiled_block_size; - }, - syn::value_list(), - syn::type_list<>(), exec, num_blocks, - block_precisions.get_const_data(), block_pointers.get_const_data(), - blocks.get_const_data(), storage_scheme, alpha->get_const_values(), - b->get_const_values() + col, b->get_stride(), 
x->get_values() + col, - x->get_stride()); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL); - - -} // namespace jacobi -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu deleted file mode 100644 index fcf238d038f..00000000000 --- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu +++ /dev/null @@ -1,100 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" -#include "core/base/extended_float.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/preconditioner/jacobi_kernels.hpp" -#include "core/preconditioner/jacobi_utils.hpp" -#include "core/synthesizer/implementation_selection.hpp" -// generated header -#include "common/cuda_hip/preconditioner/jacobi_common.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Jacobi preconditioner namespace. 
- * @ref Jacobi - * @ingroup jacobi - */ -namespace jacobi { - - -#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc" - - -// clang-format off -#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ -// clang-format on -// make things easier for IDEs -#ifndef GKO_JACOBI_BLOCK_SIZE -#define GKO_JACOBI_BLOCK_SIZE 1 -#endif - - -template -void advanced_apply( - syn::value_list, - std::shared_ptr exec, size_type num_blocks, - const precision_reduction* block_precisions, - const IndexType* block_pointers, const ValueType* blocks, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - const ValueType* alpha, const ValueType* b, size_type b_stride, - ValueType* x, size_type x_stride) -{ - constexpr int subwarp_size = get_larger_power(max_block_size); - constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const auto grid_size = - ceildiv(num_blocks, warps_per_block * blocks_per_warp); - const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - - if (grid_size > 0) { - if (block_precisions) { - kernel::advanced_adaptive_apply - <<get_stream()>>>( - as_device_type(blocks), storage_scheme, block_precisions, - block_pointers, num_blocks, as_device_type(alpha), - as_device_type(b), b_stride, as_device_type(x), x_stride); - } else { - kernel::advanced_apply - <<get_stream()>>>( - as_device_type(blocks), storage_scheme, block_pointers, - num_blocks, as_device_type(alpha), as_device_type(b), - b_stride, as_device_type(x), x_stride); - } - } -} - - -#define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType) \ - void advanced_apply( \ - syn::value_list, \ - std::shared_ptr exec, size_type, \ - const precision_reduction*, const IndexType* block_pointers, \ - const ValueType*, \ - const preconditioner::block_interleaved_storage_scheme&, \ - const ValueType*, const ValueType*, size_type, ValueType*, size_type) - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - 
DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION); - - -} // namespace jacobi -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/preconditioner/jacobi_generate_kernels.cu b/cuda/preconditioner/jacobi_generate_kernels.cu deleted file mode 100644 index d51f1947b7a..00000000000 --- a/cuda/preconditioner/jacobi_generate_kernels.cu +++ /dev/null @@ -1,72 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include -#include - -#include "core/components/fill_array_kernels.hpp" -#include "core/preconditioner/jacobi_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -// generated header -#include "common/cuda_hip/preconditioner/jacobi_common.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Jacobi preconditioner namespace. - * @ref Jacobi - * @ingroup jacobi - */ -namespace jacobi { - - -template -void generate(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* mtx, - remove_complex accuracy, ValueType* block_data, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - remove_complex* conditioning, - precision_reduction* block_precisions, - const IndexType* block_ptrs, size_type num_blocks); - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generate, generate); - - -template -void generate(std::shared_ptr exec, - const matrix::Csr* system_matrix, - size_type num_blocks, uint32 max_block_size, - remove_complex accuracy, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - array>& conditioning, - array& block_precisions, - const array& block_pointers, array& blocks) -{ - components::fill_array(exec, blocks.get_data(), blocks.get_size(), - zero()); - select_generate( - compiled_kernels(), - [&](int compiled_block_size) { - return max_block_size <= compiled_block_size; - }, - syn::value_list(), syn::type_list<>(), - exec, system_matrix, accuracy, blocks.get_data(), 
storage_scheme, - conditioning.get_data(), block_precisions.get_data(), - block_pointers.get_const_data(), num_blocks); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_JACOBI_GENERATE_KERNEL); - - -} // namespace jacobi -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu deleted file mode 100644 index aa8807728a8..00000000000 --- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu +++ /dev/null @@ -1,108 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/diagonal_block_manipulation.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" -#include "core/base/extended_float.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/preconditioner/jacobi_kernels.hpp" -#include "core/preconditioner/jacobi_utils.hpp" -#include "core/synthesizer/implementation_selection.hpp" -// generated header -#include "common/cuda_hip/preconditioner/jacobi_common.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Jacobi preconditioner namespace. 
- * @ref Jacobi - * @ingroup jacobi - */ -namespace jacobi { - - -#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc" - - -// clang-format off -#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ -// clang-format on -// make things easier for IDEs -#ifndef GKO_JACOBI_BLOCK_SIZE -#define GKO_JACOBI_BLOCK_SIZE 1 -#endif - - -template -void generate(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* mtx, - remove_complex accuracy, ValueType* block_data, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - remove_complex* conditioning, - precision_reduction* block_precisions, - const IndexType* block_ptrs, size_type num_blocks) -{ - constexpr int subwarp_size = get_larger_power(max_block_size); - constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const auto grid_size = - ceildiv(num_blocks, warps_per_block * blocks_per_warp); - const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - - if (grid_size > 0) { - if (block_precisions) { - kernel::adaptive_generate - <<get_stream()>>>( - mtx->get_size()[0], mtx->get_const_row_ptrs(), - mtx->get_const_col_idxs(), - as_device_type(mtx->get_const_values()), - as_device_type(accuracy), as_device_type(block_data), - storage_scheme, as_device_type(conditioning), - block_precisions, block_ptrs, num_blocks); - } else { - kernel::generate - <<get_stream()>>>( - mtx->get_size()[0], mtx->get_const_row_ptrs(), - mtx->get_const_col_idxs(), - as_device_type(mtx->get_const_values()), - as_device_type(block_data), storage_scheme, block_ptrs, - num_blocks); - } - } -} - - -#define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType) \ - void generate( \ - syn::value_list, \ - std::shared_ptr exec, \ - const matrix::Csr*, remove_complex, \ - ValueType*, \ - const preconditioner::block_interleaved_storage_scheme&, \ - remove_complex*, precision_reduction*, const IndexType*, \ - size_type) - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - 
DECLARE_JACOBI_GENERATE_INSTANTIATION); - - -} // namespace jacobi -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index e6a337da7b9..67617169b5a 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -21,9 +21,6 @@ set(GINKGO_HIP_SOURCES ${CSR_INSTANTIATE} ${FBCSR_INSTANTIATE} preconditioner/batch_jacobi_kernels.hip.cpp - preconditioner/jacobi_advanced_apply_kernels.hip.cpp - preconditioner/jacobi_generate_kernels.hip.cpp - preconditioner/jacobi_simple_apply_kernels.hip.cpp solver/batch_bicgstab_kernels.hip.cpp solver/batch_cg_kernels.hip.cpp solver/lower_trs_kernels.hip.cpp @@ -48,33 +45,15 @@ else() set(GKO_HIP_JACOBI_BLOCK_SIZES 1 2 4 8 13 16 32 ${GKO_HIP_JACOBI_MAX_BLOCK_SIZE}) list(REMOVE_DUPLICATES GKO_HIP_JACOBI_BLOCK_SIZES) endif() -foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES) - configure_file( - preconditioner/jacobi_generate_kernels.instantiate.hip.cpp - preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) - configure_file( - preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp - preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) - configure_file( - preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp - preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) - # The 3D indexing used in Jacobi kernel triggers an instruction selection bug in Debug builds - # Probably the same as https://github.com/llvm/llvm-project/issues/67574 - # Fixed in ROCm 6.0 https://github.com/ROCm/llvm-project/commit/cd7f574a1fd1d3f3e8b9c1cae61fa8133a51de5f - # and in LLVM trunk https://github.com/llvm/llvm-project/commit/cc3d2533cc2e4ea06981b86ede5087fbf801e789 - set_source_files_properties( - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp - 
${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp - PROPERTIES - COMPILE_OPTIONS $<$:-O2>) - list(APPEND GINKGO_HIP_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) -endforeach() -string(REPLACE ";" "," GKO_HIP_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}") -configure_file(preconditioner/jacobi_common.hip.hpp.in common/cuda_hip/preconditioner/jacobi_common.hpp) +jacobi_generated_files(GKO_HIP_JACOBI_SOURCES "${GKO_HIP_JACOBI_BLOCK_SIZES}") +# The 3D indexing used in Jacobi kernel triggers an instruction selection bug in Debug builds +# Probably the same as https://github.com/llvm/llvm-project/issues/67574 +# Fixed in ROCm 6.0 https://github.com/ROCm/llvm-project/commit/cd7f574a1fd1d3f3e8b9c1cae61fa8133a51de5f +# and in LLVM trunk https://github.com/llvm/llvm-project/commit/cc3d2533cc2e4ea06981b86ede5087fbf801e789 +set_source_files_properties(${GKO_HIP_JACOBI_SOURCES} PROPERTIES COMPILE_OPTIONS $<$:-O2>) +list(APPEND GINKGO_HIP_SOURCES ${GKO_HIP_JACOBI_SOURCES}) +string(REPLACE ";" "," GKO_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}") +configure_file(${Ginkgo_SOURCE_DIR}/common/cuda_hip/preconditioner/jacobi_common.hpp.in common/cuda_hip/preconditioner/jacobi_common.hpp) set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES LANGUAGE HIP) add_library(ginkgo_hip $ ${GINKGO_HIP_SOURCES}) diff --git a/hip/preconditioner/jacobi_common.hip.hpp.in b/hip/preconditioner/jacobi_common.hip.hpp.in deleted file mode 100644 index 2185e124db6..00000000000 --- 
a/hip/preconditioner/jacobi_common.hip.hpp.in +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include -#include - - -#include "common/cuda_hip/base/config.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace jacobi { - - -/** - * A compile-time list of block sizes for which dedicated generate and apply - * kernels should be compiled. - */ -// clang-format off -#cmakedefine GKO_HIP_JACOBI_BLOCK_SIZES_CODE @GKO_HIP_JACOBI_BLOCK_SIZES_CODE@ -// clang-format on -// make things easier for IDEs -#ifndef GKO_HIP_JACOBI_BLOCK_SIZES_CODE -#define GKO_HIP_JACOBI_BLOCK_SIZES_CODE 1 -#endif - - -using compiled_kernels = syn::value_list; - - -constexpr int get_larger_power(int value, int guess = 1) -{ - return guess >= value ? guess : get_larger_power(value, guess << 1); -} - - -} // namespace jacobi -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp deleted file mode 100644 index 698efe6a858..00000000000 --- a/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp +++ /dev/null @@ -1,108 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/diagonal_block_manipulation.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" -#include "core/base/extended_float.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/preconditioner/jacobi_kernels.hpp" -#include 
"core/preconditioner/jacobi_utils.hpp" -#include "core/synthesizer/implementation_selection.hpp" -// generated header -#include "common/cuda_hip/preconditioner/jacobi_common.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Jacobi preconditioner namespace. - * @ref Jacobi - * @ingroup jacobi - */ -namespace jacobi { - - -#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc" - - -// clang-format off -#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ -// clang-format on -// make things easier for IDEs -#ifndef GKO_JACOBI_BLOCK_SIZE -#define GKO_JACOBI_BLOCK_SIZE 1 -#endif - - -template -void generate(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* mtx, - remove_complex accuracy, ValueType* block_data, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - remove_complex* conditioning, - precision_reduction* block_precisions, - const IndexType* block_ptrs, size_type num_blocks) -{ - constexpr int subwarp_size = get_larger_power(max_block_size); - constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const auto grid_size = - ceildiv(num_blocks, warps_per_block * blocks_per_warp); - const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - - if (grid_size > 0) { - if (block_precisions) { - kernel::adaptive_generate - <<get_stream()>>>( - mtx->get_size()[0], mtx->get_const_row_ptrs(), - mtx->get_const_col_idxs(), - as_device_type(mtx->get_const_values()), - as_device_type(accuracy), as_device_type(block_data), - storage_scheme, as_device_type(conditioning), - block_precisions, block_ptrs, num_blocks); - } else { - kernel::generate - <<get_stream()>>>( - mtx->get_size()[0], mtx->get_const_row_ptrs(), - mtx->get_const_col_idxs(), - as_device_type(mtx->get_const_values()), - as_device_type(block_data), storage_scheme, block_ptrs, - num_blocks); - } - } -} - - -#define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType) \ - void generate( \ 
- syn::value_list, \ - std::shared_ptr exec, \ - const matrix::Csr*, remove_complex, \ - ValueType*, \ - const preconditioner::block_interleaved_storage_scheme&, \ - remove_complex*, precision_reduction*, const IndexType*, \ - size_type) - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - DECLARE_JACOBI_GENERATE_INSTANTIATION); - - -} // namespace jacobi -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp deleted file mode 100644 index 16ca805a42c..00000000000 --- a/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" -#include "core/base/extended_float.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/preconditioner/jacobi_kernels.hpp" -#include "core/preconditioner/jacobi_utils.hpp" -#include "core/synthesizer/implementation_selection.hpp" -// generated header -#include "common/cuda_hip/preconditioner/jacobi_common.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Jacobi preconditioner namespace. 
- * @ref Jacobi - * @ingroup jacobi - */ -namespace jacobi { - - -#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc" - - -template -void apply(syn::value_list, - std::shared_ptr exec, size_type num_blocks, - const precision_reduction* block_precisions, - const IndexType* block_pointers, const ValueType* blocks, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - const ValueType* b, size_type b_stride, ValueType* x, - size_type x_stride); - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_apply, apply); - - -template -void simple_apply( - std::shared_ptr exec, size_type num_blocks, - uint32 max_block_size, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - const array& block_precisions, - const array& block_pointers, const array& blocks, - const matrix::Dense* b, matrix::Dense* x) -{ - // TODO: write a special kernel for multiple RHS - for (size_type col = 0; col < b->get_size()[1]; ++col) { - select_apply( - compiled_kernels(), - [&](int compiled_block_size) { - return max_block_size <= compiled_block_size; - }, - syn::value_list(), - syn::type_list<>(), exec, num_blocks, - block_precisions.get_const_data(), block_pointers.get_const_data(), - blocks.get_const_data(), storage_scheme, - b->get_const_values() + col, b->get_stride(), x->get_values() + col, - x->get_stride()); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); - - -} // namespace jacobi -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp deleted file mode 100644 index d666a698b5e..00000000000 --- a/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include - -#include 
"common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" -#include "core/base/extended_float.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/preconditioner/jacobi_kernels.hpp" -#include "core/preconditioner/jacobi_utils.hpp" -#include "core/synthesizer/implementation_selection.hpp" -// generated header -#include "common/cuda_hip/preconditioner/jacobi_common.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Jacobi preconditioner namespace. - * @ref Jacobi - * @ingroup jacobi - */ -namespace jacobi { - - -#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc" - - -// clang-format off -#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ -// clang-format on -// make things easier for IDEs -#ifndef GKO_JACOBI_BLOCK_SIZE -#define GKO_JACOBI_BLOCK_SIZE 1 -#endif - - -template -void apply(syn::value_list, - std::shared_ptr exec, size_type num_blocks, - const precision_reduction* block_precisions, - const IndexType* block_pointers, const ValueType* blocks, - const preconditioner::block_interleaved_storage_scheme& - storage_scheme, - const ValueType* b, size_type b_stride, ValueType* x, - size_type x_stride) -{ - constexpr int subwarp_size = get_larger_power(max_block_size); - constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const auto grid_size = - ceildiv(num_blocks, warps_per_block * blocks_per_warp); - const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - - if (grid_size > 0) { - if (block_precisions) { - kernel::adaptive_apply - <<get_stream()>>>( - as_device_type(blocks), storage_scheme, block_precisions, - block_pointers, num_blocks, as_device_type(b), b_stride, - as_device_type(x), x_stride); - } else { - kernel::apply - 
<<get_stream()>>>( - as_device_type(blocks), storage_scheme, block_pointers, - num_blocks, as_device_type(b), b_stride, as_device_type(x), - x_stride); - } - } -} - - -#define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType) \ - void apply( \ - syn::value_list, \ - std::shared_ptr exec, size_type, \ - const precision_reduction*, const IndexType*, const ValueType*, \ - const preconditioner::block_interleaved_storage_scheme&, \ - const ValueType*, size_type, ValueType*, size_type) - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION); - - -} // namespace jacobi -} // namespace hip -} // namespace kernels -} // namespace gko From 068dc49a7ce35e43ddc0a6b2fc235381a4bbcc88 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 13 Jul 2024 00:35:04 +0200 Subject: [PATCH 091/448] preparation --- cuda/matrix/csr_kernels.template.cu | 8 +++--- hip/matrix/csr_kernels.template.hip.cpp | 33 +++++++++++++------------ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu index c8d193e09af..89e5de9c303 100644 --- a/cuda/matrix/csr_kernels.template.cu +++ b/cuda/matrix/csr_kernels.template.cu @@ -27,6 +27,7 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" @@ -54,7 +55,7 @@ namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The Compressed sparse row matrix format namespace. 
* @@ -224,6 +225,7 @@ void classical_spmv(syn::value_list, { using arithmetic_type = highest_precision; + const auto nwarps = exec->get_num_warps_per_sm() * exec->get_num_multiprocessor() * classical_oversubscription; @@ -488,7 +490,7 @@ void spmv(std::shared_ptr exec, a->get_strategy())) { max_length_per_row = strategy->get_max_length_per_row(); } else { - // as a fall-back: use average row length + // as a fall-back: use average row length, at least 1 max_length_per_row = a->get_num_stored_elements() / std::max(a->get_size()[0], 1); } @@ -995,6 +997,6 @@ void sort_by_column_index(std::shared_ptr exec, } // namespace csr -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp index 473361029c8..1fb086c5ea6 100644 --- a/hip/matrix/csr_kernels.template.hip.cpp +++ b/hip/matrix/csr_kernels.template.hip.cpp @@ -33,6 +33,7 @@ #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/atomic.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" #include "common/cuda_hip/components/intrinsics.hpp" #include "common/cuda_hip/components/merging.hpp" #include "common/cuda_hip/components/prefix_sum.hpp" @@ -54,7 +55,7 @@ namespace gko { namespace kernels { -namespace hip { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The Compressed sparse row matrix format namespace. 
* @@ -93,7 +94,7 @@ namespace { template void merge_path_spmv(syn::value_list, - std::shared_ptr exec, + std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c, @@ -174,7 +175,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv); template -int compute_items_per_thread(std::shared_ptr exec) +int compute_items_per_thread(std::shared_ptr exec) { #if GINKGO_HIP_PLATFORM_NVCC @@ -231,7 +232,7 @@ int compute_items_per_thread(std::shared_ptr exec) template void classical_spmv(syn::value_list, - std::shared_ptr exec, + std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c, @@ -285,7 +286,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); template -void load_balance_spmv(std::shared_ptr exec, +void load_balance_spmv(std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c, @@ -336,7 +337,7 @@ void load_balance_spmv(std::shared_ptr exec, template -bool try_general_sparselib_spmv(std::shared_ptr exec, +bool try_general_sparselib_spmv(std::shared_ptr exec, const ValueType* alpha, const matrix::Csr* a, const matrix::Dense* b, @@ -371,7 +372,7 @@ template ::value || !std::is_same::value>> -bool try_sparselib_spmv(std::shared_ptr exec, +bool try_sparselib_spmv(std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c, @@ -383,7 +384,7 @@ bool try_sparselib_spmv(std::shared_ptr exec, } template -bool try_sparselib_spmv(std::shared_ptr exec, +bool try_sparselib_spmv(std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c, @@ -409,7 +410,7 @@ bool try_sparselib_spmv(std::shared_ptr exec, template -void spmv(std::shared_ptr exec, +void spmv(std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, matrix::Dense* c) @@ -466,7 +467,7 @@ void spmv(std::shared_ptr exec, template -void advanced_spmv(std::shared_ptr exec, +void advanced_spmv(std::shared_ptr exec, const 
matrix::Dense* alpha, const matrix::Csr* a, const matrix::Dense* b, @@ -527,7 +528,7 @@ void advanced_spmv(std::shared_ptr exec, template -void spgemm(std::shared_ptr exec, +void spgemm(std::shared_ptr exec, const matrix::Csr* a, const matrix::Csr* b, matrix::Csr* c) @@ -600,7 +601,7 @@ void spgemm(std::shared_ptr exec, template -void advanced_spgemm(std::shared_ptr exec, +void advanced_spgemm(std::shared_ptr exec, const matrix::Dense* alpha, const matrix::Csr* a, const matrix::Csr* b, @@ -691,7 +692,7 @@ void advanced_spgemm(std::shared_ptr exec, template -void transpose(std::shared_ptr exec, +void transpose(std::shared_ptr exec, const matrix::Csr* orig, matrix::Csr* trans) { @@ -715,7 +716,7 @@ void transpose(std::shared_ptr exec, template -void conj_transpose(std::shared_ptr exec, +void conj_transpose(std::shared_ptr exec, const matrix::Csr* orig, matrix::Csr* trans) { @@ -747,7 +748,7 @@ void conj_transpose(std::shared_ptr exec, template -void sort_by_column_index(std::shared_ptr exec, +void sort_by_column_index(std::shared_ptr exec, matrix::Csr* to_sort) { if (sparselib::is_supported::value) { @@ -792,6 +793,6 @@ void sort_by_column_index(std::shared_ptr exec, } // namespace csr -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko From ed2d73c01a4a9f03995f390aeb01180074c75e94 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 14 Jul 2024 22:43:15 +0200 Subject: [PATCH 092/448] unify Csr, remove CUDA 10.x ifdefs --- .../matrix/csr_kernels.instantiate.cpp | 6 +- ...rnels.hpp.inc => csr_kernels.template.cpp} | 1038 +++++++++++++++++ cuda/CMakeLists.txt | 4 +- cuda/matrix/csr_kernels.instantiate.cu | 81 -- cuda/matrix/csr_kernels.template.cu | 1002 ---------------- hip/CMakeLists.txt | 2 +- hip/matrix/csr_kernels.template.hip.cpp | 798 ------------- 7 files changed, 1044 insertions(+), 1887 deletions(-) rename hip/matrix/csr_kernels.instantiate.hip.cpp => common/cuda_hip/matrix/csr_kernels.instantiate.cpp (97%) 
rename common/cuda_hip/matrix/{csr_kernels.hpp.inc => csr_kernels.template.cpp} (62%) delete mode 100644 cuda/matrix/csr_kernels.instantiate.cu delete mode 100644 cuda/matrix/csr_kernels.template.cu delete mode 100644 hip/matrix/csr_kernels.template.hip.cpp diff --git a/hip/matrix/csr_kernels.instantiate.hip.cpp b/common/cuda_hip/matrix/csr_kernels.instantiate.cpp similarity index 97% rename from hip/matrix/csr_kernels.instantiate.hip.cpp rename to common/cuda_hip/matrix/csr_kernels.instantiate.cpp index 53a5a572aea..f62ca1c1815 100644 --- a/hip/matrix/csr_kernels.instantiate.hip.cpp +++ b/common/cuda_hip/matrix/csr_kernels.instantiate.cpp @@ -2,12 +2,12 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "hip/matrix/csr_kernels.template.hip.cpp" +#include "common/cuda_hip/matrix/csr_kernels.template.cpp" namespace gko { namespace kernels { -namespace hip { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The Compressed sparse row matrix format namespace. * @@ -124,6 +124,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace csr -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.template.cpp similarity index 62% rename from common/cuda_hip/matrix/csr_kernels.hpp.inc rename to common/cuda_hip/matrix/csr_kernels.template.cpp index 85b98f15825..eda0e856b07 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.template.cpp @@ -2,6 +2,88 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/matrix/csr_kernels.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "accessor/cuda_hip_helper.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" 
+#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/array_access.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/csr_accessor_helper.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_lookup.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Compressed sparse row matrix format namespace. + * + * @ingroup csr + */ +namespace csr { + + +constexpr int default_block_size = 512; +constexpr int warps_in_block = 4; +constexpr int spmv_block_size = warps_in_block * config::warp_size; +constexpr int classical_oversubscription = 32; + + +/** + * A compile-time list of the number items per threads for which spmv kernel + * should be compiled. 
+ */ +using compiled_kernels = syn::value_list; + +using classical_kernels = + syn::value_list; + +using spgeam_kernels = + syn::value_list; + + +#include "common/cuda_hip/matrix/csr_common.hpp.inc" namespace kernel { @@ -1779,3 +1861,959 @@ void add_scaled_identity(std::shared_ptr exec, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), as_device_type(mtx->get_values())); } + + +namespace host_kernel { +namespace { + + +template +void merge_path_spmv(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* a, + const matrix::Dense* b, + matrix::Dense* c, + const matrix::Dense* alpha = nullptr, + const matrix::Dense* beta = nullptr) +{ + using arithmetic_type = + highest_precision; + const IndexType total = a->get_size()[0] + a->get_num_stored_elements(); + const IndexType grid_num = + ceildiv(total, spmv_block_size * items_per_thread); + const auto grid = grid_num; + const auto block = spmv_block_size; + // TODO: workspace? + array row_out(exec, grid_num); + // TODO: should we store the value in arithmetic_type or output_type? 
+ array val_out(exec, grid_num); + + const auto a_vals = + acc::helper::build_const_rrm_accessor(a); + + for (IndexType column_id = 0; column_id < b->get_size()[1]; column_id++) { + const auto column_span = + acc::index_span(static_cast(column_id), + static_cast(column_id + 1)); + const auto b_vals = + acc::helper::build_const_rrm_accessor(b, + column_span); + auto c_vals = + acc::helper::build_rrm_accessor(c, column_span); + if (alpha == nullptr && beta == nullptr) { + if (grid_num > 0) { + kernel::abstract_merge_path_spmv + <<get_stream()>>>( + static_cast(a->get_size()[0]), + acc::as_device_range(a_vals), a->get_const_col_idxs(), + as_device_type(a->get_const_row_ptrs()), + as_device_type(a->get_const_srow()), + acc::as_device_range(b_vals), + acc::as_device_range(c_vals), + as_device_type(row_out.get_data()), + as_device_type(val_out.get_data())); + } + kernel:: + abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>( + grid_num, as_device_type(val_out.get_data()), + as_device_type(row_out.get_data()), + acc::as_device_range(c_vals)); + + } else if (alpha != nullptr && beta != nullptr) { + if (grid_num > 0) { + kernel::abstract_merge_path_spmv + <<get_stream()>>>( + static_cast(a->get_size()[0]), + as_device_type(alpha->get_const_values()), + acc::as_device_range(a_vals), a->get_const_col_idxs(), + as_device_type(a->get_const_row_ptrs()), + as_device_type(a->get_const_srow()), + acc::as_device_range(b_vals), + as_device_type(beta->get_const_values()), + acc::as_device_range(c_vals), + as_device_type(row_out.get_data()), + as_device_type(val_out.get_data())); + } + kernel:: + abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>( + grid_num, as_device_type(val_out.get_data()), + as_device_type(row_out.get_data()), + as_device_type(alpha->get_const_values()), + acc::as_device_range(c_vals)); + } else { + GKO_KERNEL_NOT_FOUND; + } + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv); + + +template +int 
compute_items_per_thread(std::shared_ptr exec) +{ +#if defined(GKO_COMPILING_CUDA) || GINKGO_HIP_PLATFORM_NVCC + + + const int version = + (exec->get_major_version() << 4) + exec->get_minor_version(); + // The num_item is decided to make the occupancy 100% + // TODO: Extend this list when new GPU is released + // Tune this parameter + // 128 threads/block the number of items per threads + // 3.0 3.5: 6 + // 3.7: 14 + // 5.0, 5.3, 6.0, 6.2: 8 + // 5.2, 6.1, 7.0: 12 + int num_item = 6; + switch (version) { + case 0x50: + case 0x53: + case 0x60: + case 0x62: + num_item = 8; + break; + case 0x52: + case 0x61: + case 0x70: + num_item = 12; + break; + case 0x37: + num_item = 14; + } + + +#else + + + // HIP uses the minimal num_item to make the code work correctly. + // TODO: this parameter should be tuned. + int num_item = 6; + + +#endif // GINKGO_HIP_PLATFORM_NVCC + + + // Ensure that the following is satisfied: + // sizeof(IndexType) + sizeof(ValueType) + // <= items_per_thread * sizeof(IndexType) + constexpr int minimal_num = + ceildiv(sizeof(IndexType) + sizeof(ValueType), sizeof(IndexType)); + int items_per_thread = num_item * 4 / sizeof(IndexType); + return std::max(minimal_num, items_per_thread); +} + + +template +void classical_spmv(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* a, + const matrix::Dense* b, + matrix::Dense* c, + const matrix::Dense* alpha = nullptr, + const matrix::Dense* beta = nullptr) +{ + using arithmetic_type = + highest_precision; + + const auto nwarps = exec->get_num_warps_per_sm() * + exec->get_num_multiprocessor() * + classical_oversubscription; + const auto gridx = + std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), + int64(nwarps / warps_in_block)); + const dim3 grid(gridx, b->get_size()[1]); + const auto block = spmv_block_size; + + const auto a_vals = + acc::helper::build_const_rrm_accessor(a); + const auto b_vals = + acc::helper::build_const_rrm_accessor(b); + auto c_vals = 
acc::helper::build_rrm_accessor(c); + if (alpha == nullptr && beta == nullptr) { + if (grid.x > 0 && grid.y > 0) { + kernel::abstract_classical_spmv + <<get_stream()>>>( + a->get_size()[0], acc::as_device_range(a_vals), + a->get_const_col_idxs(), + as_device_type(a->get_const_row_ptrs()), + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); + } + } else if (alpha != nullptr && beta != nullptr) { + if (grid.x > 0 && grid.y > 0) { + kernel::abstract_classical_spmv + <<get_stream()>>>( + a->get_size()[0], as_device_type(alpha->get_const_values()), + acc::as_device_range(a_vals), a->get_const_col_idxs(), + as_device_type(a->get_const_row_ptrs()), + acc::as_device_range(b_vals), + as_device_type(beta->get_const_values()), + acc::as_device_range(c_vals)); + } + } else { + GKO_KERNEL_NOT_FOUND; + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); + + +template +void load_balance_spmv(std::shared_ptr exec, + const matrix::Csr* a, + const matrix::Dense* b, + matrix::Dense* c, + const matrix::Dense* alpha = nullptr, + const matrix::Dense* beta = nullptr) +{ + using arithmetic_type = + highest_precision; + + if (beta) { + dense::scale(exec, beta, c); + } else { + dense::fill(exec, c, zero()); + } + const IndexType nwarps = a->get_num_srow_elements(); + if (nwarps > 0) { + const dim3 csr_block(config::warp_size, warps_in_block, 1); + const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]); + const auto a_vals = + acc::helper::build_const_rrm_accessor(a); + const auto b_vals = + acc::helper::build_const_rrm_accessor(b); + auto c_vals = acc::helper::build_rrm_accessor(c); + if (alpha) { + if (csr_grid.x > 0 && csr_grid.y > 0) { + kernel::abstract_spmv<<get_stream()>>>( + nwarps, static_cast(a->get_size()[0]), + as_device_type(alpha->get_const_values()), + acc::as_device_range(a_vals), a->get_const_col_idxs(), + as_device_type(a->get_const_row_ptrs()), + as_device_type(a->get_const_srow()), + acc::as_device_range(b_vals), 
acc::as_device_range(c_vals)); + } + } else { + if (csr_grid.x > 0 && csr_grid.y > 0) { + kernel::abstract_spmv<<get_stream()>>>( + nwarps, static_cast(a->get_size()[0]), + acc::as_device_range(a_vals), a->get_const_col_idxs(), + as_device_type(a->get_const_row_ptrs()), + as_device_type(a->get_const_srow()), + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); + } + } + } +} + + +template +bool try_general_sparselib_spmv(std::shared_ptr exec, + const ValueType* alpha, + const matrix::Csr* a, + const matrix::Dense* b, + const ValueType* beta, + matrix::Dense* c) +{ +#ifdef GKO_COMPILING_HIP + bool try_sparselib = sparselib::is_supported::value; + try_sparselib = + try_sparselib && b->get_stride() == 1 && c->get_stride() == 1; + // rocSPARSE has issues with zero matrices + try_sparselib = try_sparselib && a->get_num_stored_elements() > 0; + if (try_sparselib) { + auto descr = sparselib::create_mat_descr(); + + auto row_ptrs = a->get_const_row_ptrs(); + auto col_idxs = a->get_const_col_idxs(); + + sparselib::spmv(exec->get_sparselib_handle(), + SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0], + a->get_size()[1], a->get_num_stored_elements(), alpha, + descr, a->get_const_values(), row_ptrs, col_idxs, + b->get_const_values(), beta, c->get_values()); + + sparselib::destroy(descr); + } + return try_sparselib; +#else // GKO_COMPILING_CUDA + auto handle = exec->get_sparselib_handle(); + // workaround for a division by zero in cuSPARSE 11.? 
+ if (a->get_size()[1] == 0) { + return false; + } + cusparseOperation_t trans = SPARSELIB_OPERATION_NON_TRANSPOSE; + auto row_ptrs = const_cast(a->get_const_row_ptrs()); + auto col_idxs = const_cast(a->get_const_col_idxs()); + auto values = const_cast(a->get_const_values()); + auto mat = sparselib::create_csr(a->get_size()[0], a->get_size()[1], + a->get_num_stored_elements(), row_ptrs, + col_idxs, values); + auto b_val = const_cast(b->get_const_values()); + auto c_val = c->get_values(); + if (b->get_stride() == 1 && c->get_stride() == 1) { + auto vecb = sparselib::create_dnvec(b->get_size()[0], b_val); + auto vecc = sparselib::create_dnvec(c->get_size()[0], c_val); +#if CUDA_VERSION >= 11021 + constexpr auto alg = CUSPARSE_SPMV_CSR_ALG1; +#else + constexpr auto alg = CUSPARSE_CSRMV_ALG1; +#endif + size_type buffer_size = 0; + sparselib::spmv_buffersize(handle, trans, alpha, mat, vecb, + beta, vecc, alg, &buffer_size); + + array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + sparselib::spmv(handle, trans, alpha, mat, vecb, beta, vecc, + alg, buffer); + sparselib::destroy(vecb); + sparselib::destroy(vecc); + } else { +#if CUDA_VERSION >= 11060 + if (b->get_size()[1] == 1) { + // cusparseSpMM seems to take the single strided vector as column + // major without considering stride and row major (cuda 11.6) + return false; + } +#endif // CUDA_VERSION >= 11060 + cusparseSpMMAlg_t alg = CUSPARSE_SPMM_CSR_ALG2; + auto vecb = + sparselib::create_dnmat(b->get_size(), b->get_stride(), b_val); + auto vecc = + sparselib::create_dnmat(c->get_size(), c->get_stride(), c_val); + size_type buffer_size = 0; + sparselib::spmm_buffersize(handle, trans, trans, alpha, mat, + vecb, beta, vecc, alg, + &buffer_size); + + array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + sparselib::spmm(handle, trans, trans, alpha, mat, vecb, beta, + vecc, alg, buffer); + sparselib::destroy(vecb); + sparselib::destroy(vecc); + } + 
sparselib::destroy(mat); + return true; +#endif // GKO_COMPILING_CUDA +} + + +template ::value || + !std::is_same::value>> +bool try_sparselib_spmv(std::shared_ptr exec, + const matrix::Csr* a, + const matrix::Dense* b, + matrix::Dense* c, + const matrix::Dense* alpha = nullptr, + const matrix::Dense* beta = nullptr) +{ + // TODO: support sparselib mixed + return false; +} + +template +bool try_sparselib_spmv(std::shared_ptr exec, + const matrix::Csr* a, + const matrix::Dense* b, + matrix::Dense* c, + const matrix::Dense* alpha = nullptr, + const matrix::Dense* beta = nullptr) +{ + if (alpha) { + return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b, + beta->get_const_values(), c); + } else { + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); + const auto valpha = one(); + const auto vbeta = zero(); + return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c); + } +} + + +} // anonymous namespace +} // namespace host_kernel + + +template +void spmv(std::shared_ptr exec, + const matrix::Csr* a, + const matrix::Dense* b, + matrix::Dense* c) +{ + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + } else if (a->get_strategy()->get_name() == "load_balance") { + host_kernel::load_balance_spmv(exec, a, b, c); + } else if (a->get_strategy()->get_name() == "merge_path") { + using arithmetic_type = + highest_precision; + int items_per_thread = + host_kernel::compute_items_per_thread( + exec); + host_kernel::select_merge_path_spmv( + compiled_kernels(), + [&items_per_thread](int compiled_info) { + return items_per_thread == compiled_info; + }, + syn::value_list(), syn::type_list<>(), exec, a, b, c); + } else { + bool use_classical = true; + if (a->get_strategy()->get_name() == "sparselib" || + a->get_strategy()->get_name() == "cusparse") { + use_classical = !host_kernel::try_sparselib_spmv(exec, a, b, c); + } + if (use_classical) { + IndexType max_length_per_row = 0; + 
using Tcsr = matrix::Csr; + if (auto strategy = + std::dynamic_pointer_cast( + a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else if (auto strategy = std::dynamic_pointer_cast< + const typename Tcsr::automatical>( + a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else { + // as a fall-back: use average row length, at least 1 + max_length_per_row = a->get_num_stored_elements() / + std::max(a->get_size()[0], 1); + } + max_length_per_row = std::max(max_length_per_row, 1); + host_kernel::select_classical_spmv( + classical_kernels(), + [&max_length_per_row](int compiled_info) { + return max_length_per_row >= compiled_info; + }, + syn::value_list(), syn::type_list<>(), exec, a, b, c); + } + } +} + + +template +void advanced_spmv(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Csr* a, + const matrix::Dense* b, + const matrix::Dense* beta, + matrix::Dense* c) +{ + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + } else if (a->get_strategy()->get_name() == "load_balance") { + host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta); + } else if (a->get_strategy()->get_name() == "merge_path") { + using arithmetic_type = + highest_precision; + int items_per_thread = + host_kernel::compute_items_per_thread( + exec); + host_kernel::select_merge_path_spmv( + compiled_kernels(), + [&items_per_thread](int compiled_info) { + return items_per_thread == compiled_info; + }, + syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, + beta); + } else { + bool use_classical = true; + if (a->get_strategy()->get_name() == "sparselib" || + a->get_strategy()->get_name() == "cusparse") { + use_classical = + !host_kernel::try_sparselib_spmv(exec, a, b, c, alpha, beta); + } + if (use_classical) { + IndexType max_length_per_row = 0; + using Tcsr = matrix::Csr; + if (auto strategy = + std::dynamic_pointer_cast( + a->get_strategy())) { + max_length_per_row 
= strategy->get_max_length_per_row(); + } else if (auto strategy = std::dynamic_pointer_cast< + const typename Tcsr::automatical>( + a->get_strategy())) { + max_length_per_row = strategy->get_max_length_per_row(); + } else { + // as a fall-back: use average row length, at least 1 + max_length_per_row = a->get_num_stored_elements() / + std::max(a->get_size()[0], 1); + } + max_length_per_row = std::max(max_length_per_row, 1); + host_kernel::select_classical_spmv( + classical_kernels(), + [&max_length_per_row](int compiled_info) { + return max_length_per_row >= compiled_info; + }, + syn::value_list(), syn::type_list<>(), exec, a, b, c, + alpha, beta); + } + } +} + + +template +void spgemm(std::shared_ptr exec, + const matrix::Csr* a, + const matrix::Csr* b, + matrix::Csr* c) +{ +#ifdef GKO_COMPILING_HIP + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); + auto a_descr = sparselib::create_mat_descr(); + auto b_descr = sparselib::create_mat_descr(); + auto c_descr = sparselib::create_mat_descr(); + auto d_descr = sparselib::create_mat_descr(); + auto info = sparselib::create_spgemm_info(); + + auto alpha = one(); + auto a_nnz = static_cast(a->get_num_stored_elements()); + auto a_vals = a->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto b_nnz = static_cast(b->get_num_stored_elements()); + auto b_vals = b->get_const_values(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + auto null_value = static_cast(nullptr); + auto null_index = static_cast(nullptr); + auto zero_nnz = IndexType{}; + auto m = static_cast(a->get_size()[0]); + auto n = static_cast(b->get_size()[1]); + auto k = static_cast(a->get_size()[1]); + auto c_row_ptrs = c->get_row_ptrs(); + matrix::CsrBuilder c_builder{c}; + auto& c_col_idxs_array = c_builder.get_col_idx_array(); + auto& c_vals_array = 
c_builder.get_value_array(); + + // allocate buffer + size_type buffer_size{}; + sparselib::spgemm_buffer_size( + handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs, + b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, + zero_nnz, null_index, null_index, info, buffer_size); + array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + + // count nnz + IndexType c_nnz{}; + sparselib::spgemm_nnz( + handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, + b_nnz, b_row_ptrs, b_col_idxs, d_descr, zero_nnz, null_index, + null_index, c_descr, c_row_ptrs, &c_nnz, info, buffer); + + // accumulate non-zeros + c_col_idxs_array.resize_and_reset(c_nnz); + c_vals_array.resize_and_reset(c_nnz); + auto c_col_idxs = c_col_idxs_array.get_data(); + auto c_vals = c_vals_array.get_data(); + sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals, + a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, + b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz, + null_value, null_index, null_index, c_descr, c_vals, + c_row_ptrs, c_col_idxs, info, buffer); + + sparselib::destroy_spgemm_info(info); + sparselib::destroy(d_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); + } else { + GKO_NOT_IMPLEMENTED; + } +#else // GKO_COMPILING_CUDA + auto a_vals = a->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto b_vals = b->get_const_values(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + auto c_row_ptrs = c->get_row_ptrs(); + + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); + + auto alpha = one(); + auto a_nnz = static_cast(a->get_num_stored_elements()); + auto b_nnz = static_cast(b->get_num_stored_elements()); + auto null_value = static_cast(nullptr); + auto null_index = static_cast(nullptr); + auto zero_nnz = IndexType{}; + auto m 
= IndexType(a->get_size()[0]); + auto n = IndexType(b->get_size()[1]); + auto k = IndexType(a->get_size()[1]); + matrix::CsrBuilder c_builder{c}; + auto& c_col_idxs_array = c_builder.get_col_idx_array(); + auto& c_vals_array = c_builder.get_value_array(); + + const auto beta = zero(); + auto spgemm_descr = sparselib::create_spgemm_descr(); + auto a_descr = sparselib::create_csr( + m, k, a_nnz, const_cast(a_row_ptrs), + const_cast(a_col_idxs), const_cast(a_vals)); + auto b_descr = sparselib::create_csr( + k, n, b_nnz, const_cast(b_row_ptrs), + const_cast(b_col_idxs), const_cast(b_vals)); + auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index, + null_value); + + // estimate work + size_type buffer1_size{}; + sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, buffer1_size, + nullptr); + array buffer1{exec, buffer1_size}; + sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, buffer1_size, + buffer1.get_data()); + + // compute spgemm + size_type buffer2_size{}; + sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, + spgemm_descr, buffer1.get_data(), buffer2_size, + nullptr); + array buffer2{exec, buffer2_size}; + sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, + spgemm_descr, buffer1.get_data(), buffer2_size, + buffer2.get_data()); + + // copy data to result + auto c_nnz = sparselib::sparse_matrix_nnz(c_descr); + c_col_idxs_array.resize_and_reset(c_nnz); + c_vals_array.resize_and_reset(c_nnz); + sparselib::csr_set_pointers(c_descr, c_row_ptrs, + c_col_idxs_array.get_data(), + c_vals_array.get_data()); + + sparselib::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr, + spgemm_descr); + + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); + sparselib::destroy(spgemm_descr); +#endif // GKO_COMPILING_CUDA +} + + +template +void 
advanced_spgemm(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Csr* a, + const matrix::Csr* b, + const matrix::Dense* beta, + const matrix::Csr* d, + matrix::Csr* c) +{ +#ifdef GKO_COMPILING_HIP + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); + auto a_descr = sparselib::create_mat_descr(); + auto b_descr = sparselib::create_mat_descr(); + auto c_descr = sparselib::create_mat_descr(); + auto d_descr = sparselib::create_mat_descr(); + auto info = sparselib::create_spgemm_info(); + + auto a_nnz = static_cast(a->get_num_stored_elements()); + auto a_vals = a->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto b_nnz = static_cast(b->get_num_stored_elements()); + auto b_vals = b->get_const_values(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + auto d_vals = d->get_const_values(); + auto d_row_ptrs = d->get_const_row_ptrs(); + auto d_col_idxs = d->get_const_col_idxs(); + auto null_value = static_cast(nullptr); + auto null_index = static_cast(nullptr); + auto one_value = one(); + auto m = static_cast(a->get_size()[0]); + auto n = static_cast(b->get_size()[1]); + auto k = static_cast(a->get_size()[1]); + + // allocate buffer + size_type buffer_size{}; + sparselib::spgemm_buffer_size( + handle, m, n, k, &one_value, a_descr, a_nnz, a_row_ptrs, a_col_idxs, + b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, + IndexType{}, null_index, null_index, info, buffer_size); + array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + + // count nnz + array c_tmp_row_ptrs_array(exec, m + 1); + auto c_tmp_row_ptrs = c_tmp_row_ptrs_array.get_data(); + IndexType c_nnz{}; + sparselib::spgemm_nnz( + handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, + b_nnz, b_row_ptrs, b_col_idxs, d_descr, IndexType{}, null_index, + 
null_index, c_descr, c_tmp_row_ptrs, &c_nnz, info, buffer); + + // accumulate non-zeros for A * B + array c_tmp_col_idxs_array(exec, c_nnz); + array c_tmp_vals_array(exec, c_nnz); + auto c_tmp_col_idxs = c_tmp_col_idxs_array.get_data(); + auto c_tmp_vals = c_tmp_vals_array.get_data(); + sparselib::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals, + a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, + b_row_ptrs, b_col_idxs, null_value, d_descr, + IndexType{}, null_value, null_index, null_index, + c_descr, c_tmp_vals, c_tmp_row_ptrs, c_tmp_col_idxs, + info, buffer); + + // destroy hipsparse context + sparselib::destroy_spgemm_info(info); + sparselib::destroy(d_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); + + auto total_nnz = c_nnz + d->get_num_stored_elements(); + auto nnz_per_row = total_nnz / m; + select_spgeam( + spgeam_kernels(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= nnz_per_row || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, + alpha->get_const_values(), c_tmp_row_ptrs, c_tmp_col_idxs, + c_tmp_vals, beta->get_const_values(), d_row_ptrs, d_col_idxs, + d_vals, c); + } else { + GKO_NOT_IMPLEMENTED; + } +#else // GKO_COMPILING_CUDA + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); + + auto valpha = exec->copy_val_to_host(alpha->get_const_values()); + auto a_nnz = IndexType(a->get_num_stored_elements()); + auto a_vals = a->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto b_nnz = IndexType(b->get_num_stored_elements()); + auto b_vals = b->get_const_values(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + auto vbeta = exec->copy_val_to_host(beta->get_const_values()); + auto d_nnz = IndexType(d->get_num_stored_elements()); + auto d_vals = d->get_const_values(); + auto 
d_row_ptrs = d->get_const_row_ptrs(); + auto d_col_idxs = d->get_const_col_idxs(); + auto m = IndexType(a->get_size()[0]); + auto n = IndexType(b->get_size()[1]); + auto k = IndexType(a->get_size()[1]); + auto c_row_ptrs = c->get_row_ptrs(); + + auto null_value = static_cast(nullptr); + auto null_index = static_cast(nullptr); + auto one_val = one(); + auto zero_val = zero(); + auto zero_nnz = IndexType{}; + auto spgemm_descr = sparselib::create_spgemm_descr(); + auto a_descr = sparselib::create_csr( + m, k, a_nnz, const_cast(a_row_ptrs), + const_cast(a_col_idxs), const_cast(a_vals)); + auto b_descr = sparselib::create_csr( + k, n, b_nnz, const_cast(b_row_ptrs), + const_cast(b_col_idxs), const_cast(b_vals)); + auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index, + null_value); + + // estimate work + size_type buffer1_size{}; + sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, + &zero_val, c_descr, spgemm_descr, + buffer1_size, nullptr); + array buffer1{exec, buffer1_size}; + sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, + &zero_val, c_descr, spgemm_descr, + buffer1_size, buffer1.get_data()); + + // compute spgemm + size_type buffer2_size{}; + sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, nullptr); + array buffer2{exec, buffer2_size}; + sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, buffer2.get_data()); + + // write result to temporary storage + auto c_tmp_nnz = sparselib::sparse_matrix_nnz(c_descr); + array c_tmp_row_ptrs_array(exec, m + 1); + array c_tmp_col_idxs_array(exec, c_tmp_nnz); + array c_tmp_vals_array(exec, c_tmp_nnz); + sparselib::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(), + c_tmp_col_idxs_array.get_data(), + c_tmp_vals_array.get_data()); + + sparselib::spgemm_copy(handle, &one_val, a_descr, b_descr, 
&zero_val, + c_descr, spgemm_descr); + + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); + sparselib::destroy(spgemm_descr); + + auto spgeam_total_nnz = c_tmp_nnz + d->get_num_stored_elements(); + auto nnz_per_row = spgeam_total_nnz / m; + select_spgeam( + spgeam_kernels(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= nnz_per_row || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, + alpha->get_const_values(), c_tmp_row_ptrs_array.get_const_data(), + c_tmp_col_idxs_array.get_const_data(), + c_tmp_vals_array.get_const_data(), beta->get_const_values(), d_row_ptrs, + d_col_idxs, d_vals, c); +#endif // GKO_COMPILING_CUDA +} + + +template +void transpose(std::shared_ptr exec, + const matrix::Csr* orig, + matrix::Csr* trans) +{ + if (orig->get_size()[0] == 0) { + return; + } + if (sparselib::is_supported::value) { +#ifdef GKO_COMPILING_HIP + hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC; + hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO; + + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], + orig->get_size()[1], orig->get_num_stored_elements(), + orig->get_const_values(), orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), trans->get_values(), + trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase); +#else // GKO_COMPILING_CUDA + cudaDataType_t cu_value = + gko::kernels::cuda::cuda_data_type(); + cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; + cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; + cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1; + size_type buffer_size = 0; + sparselib::transpose_buffersize( + exec->get_sparselib_handle(), orig->get_size()[0], + orig->get_size()[1], orig->get_num_stored_elements(), + orig->get_const_values(), orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), trans->get_values(), + trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, 
copyValues, + idxBase, alg, &buffer_size); + array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], + orig->get_size()[1], orig->get_num_stored_elements(), + orig->get_const_values(), orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), trans->get_values(), + trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues, + idxBase, alg, buffer); +#endif // GKO_COMPILING_CUDA + } else { + fallback_transpose(exec, orig, trans); + } +} + + +template +void conj_transpose(std::shared_ptr exec, + const matrix::Csr* orig, + matrix::Csr* trans) +{ + if (orig->get_size()[0] == 0) { + return; + } + const auto block_size = default_block_size; + const auto grid_size = + ceildiv(trans->get_num_stored_elements(), block_size); + transpose(exec, orig, trans); + if (grid_size > 0 && is_complex()) { + kernel::conjugate<<get_stream()>>>( + trans->get_num_stored_elements(), + as_device_type(trans->get_values())); + } +} + + +template +void sort_by_column_index(std::shared_ptr exec, + matrix::Csr* to_sort) +{ + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + auto descr = sparselib::create_mat_descr(); + auto m = IndexType(to_sort->get_size()[0]); + auto n = IndexType(to_sort->get_size()[1]); + auto nnz = IndexType(to_sort->get_num_stored_elements()); + auto row_ptrs = to_sort->get_const_row_ptrs(); + auto col_idxs = to_sort->get_col_idxs(); + auto vals = to_sort->get_values(); + + // copy values + array tmp_vals_array(exec, nnz); + exec->copy(nnz, vals, tmp_vals_array.get_data()); + auto tmp_vals = tmp_vals_array.get_const_data(); + + // init identity permutation + array permutation_array(exec, nnz); + auto permutation = permutation_array.get_data(); + components::fill_seq_array(exec, permutation, nnz); + + // allocate buffer + size_type buffer_size{}; + sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs, + buffer_size); 
+ array buffer_array{exec, buffer_size}; + auto buffer = buffer_array.get_data(); + + // sort column indices + sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs, + permutation, buffer); + + // sort values +#ifdef GKO_COMPILING_HIP + sparselib::gather(handle, nnz, tmp_vals, vals, permutation); +#else // GKO_COMPILING_CUDA + auto val_vec = sparselib::create_spvec(nnz, nnz, permutation, vals); + auto tmp_vec = + sparselib::create_dnvec(nnz, const_cast(tmp_vals)); + sparselib::gather(handle, tmp_vec, val_vec); +#endif + + sparselib::destroy(descr); + } else { + fallback_sort(exec, to_sort); + } +} + + +} // namespace csr +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 92b48518e7c..d4a94eda802 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) add_library(ginkgo_cuda $ "") include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) -add_instantiation_files(. 
matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE) +add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) @@ -41,7 +41,7 @@ else() endif() jacobi_generated_files(GKO_CUDA_JACOBI_SOURCES "${GKO_CUDA_JACOBI_BLOCK_SIZES}") # override the default language mapping for the common files, set them to CUDA -foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES GKO_CUDA_JACOBI_SOURCES FBCSR_INSTANTIATE) +foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES GKO_CUDA_JACOBI_SOURCES CSR_INSTANTIATE FBCSR_INSTANTIATE) set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA) endforeach(source_file) target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES}) diff --git a/cuda/matrix/csr_kernels.instantiate.cu b/cuda/matrix/csr_kernels.instantiate.cu deleted file mode 100644 index a24e66ed89d..00000000000 --- a/cuda/matrix/csr_kernels.instantiate.cu +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "cuda/matrix/csr_kernels.template.cu" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Compressed sparse row matrix format namespace. 
- * - * @ingroup csr - */ -namespace csr { - - -// begin -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); -// split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_SPMV_KERNEL); -// split -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX); -// split -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL); -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - 
GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); -// end - - -} // namespace csr -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu deleted file mode 100644 index 89e5de9c303..00000000000 --- a/cuda/matrix/csr_kernels.template.cu +++ /dev/null @@ -1,1002 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/csr_kernels.hpp" - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "accessor/cuda_hip_helper.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/pointer_mode_guard.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/atomic.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/merging.hpp" -#include "common/cuda_hip/components/prefix_sum.hpp" -#include 
"common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/segment_scan.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/array_access.hpp" -#include "core/base/mixed_precision_types.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/components/format_conversion_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/csr_accessor_helper.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The Compressed sparse row matrix format namespace. - * - * @ingroup csr - */ -namespace csr { - - -constexpr int default_block_size = 512; -constexpr int warps_in_block = 4; -constexpr int spmv_block_size = warps_in_block * config::warp_size; -constexpr int classical_oversubscription = 32; - - -/** - * A compile-time list of the number items per threads for which spmv kernel - * should be compiled. 
- */ -using compiled_kernels = syn::value_list; - -using classical_kernels = - syn::value_list; - -using spgeam_kernels = - syn::value_list; - - -#include "common/cuda_hip/matrix/csr_common.hpp.inc" -#include "common/cuda_hip/matrix/csr_kernels.hpp.inc" - - -namespace host_kernel { -namespace { - - -template -void merge_path_spmv(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - using arithmetic_type = - highest_precision; - const IndexType total = a->get_size()[0] + a->get_num_stored_elements(); - const IndexType grid_num = - ceildiv(total, spmv_block_size * items_per_thread); - const auto grid = grid_num; - const auto block = spmv_block_size; - // TODO: workspace? - array row_out(exec, grid_num); - // TODO: should we store the value in arithmetic_type or output_type? - array val_out(exec, grid_num); - - const auto a_vals = - acc::helper::build_const_rrm_accessor(a); - - for (IndexType column_id = 0; column_id < b->get_size()[1]; column_id++) { - const auto column_span = - acc::index_span(static_cast(column_id), - static_cast(column_id + 1)); - const auto b_vals = - acc::helper::build_const_rrm_accessor(b, - column_span); - auto c_vals = - acc::helper::build_rrm_accessor(c, column_span); - if (alpha == nullptr && beta == nullptr) { - if (grid_num > 0) { - kernel::abstract_merge_path_spmv - <<get_stream()>>>( - static_cast(a->get_size()[0]), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - as_device_type(a->get_const_srow()), - acc::as_device_range(b_vals), - acc::as_device_range(c_vals), - as_device_type(row_out.get_data()), - as_device_type(val_out.get_data())); - } - kernel:: - abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>( - grid_num, as_device_type(val_out.get_data()), - as_device_type(row_out.get_data()), - acc::as_device_range(c_vals)); - - } else 
if (alpha != nullptr && beta != nullptr) { - if (grid_num > 0) { - kernel::abstract_merge_path_spmv - <<get_stream()>>>( - static_cast(a->get_size()[0]), - as_device_type(alpha->get_const_values()), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - as_device_type(a->get_const_srow()), - acc::as_device_range(b_vals), - as_device_type(beta->get_const_values()), - acc::as_device_range(c_vals), - as_device_type(row_out.get_data()), - as_device_type(val_out.get_data())); - } - kernel:: - abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>( - grid_num, as_device_type(val_out.get_data()), - as_device_type(row_out.get_data()), - as_device_type(alpha->get_const_values()), - acc::as_device_range(c_vals)); - } else { - GKO_KERNEL_NOT_FOUND; - } - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv); - - -template -int compute_items_per_thread(std::shared_ptr exec) -{ - const int version = - (exec->get_major_version() << 4) + exec->get_minor_version(); - // The num_item is decided to make the occupancy 100% - // TODO: Extend this list when new GPU is released - // Tune this parameter - // 128 threads/block the number of items per threads - // 3.0 3.5: 6 - // 3.7: 14 - // 5.0, 5.3, 6.0, 6.2: 8 - // 5.2, 6.1, 7.0: 12 - int num_item = 6; - switch (version) { - case 0x50: - case 0x53: - case 0x60: - case 0x62: - num_item = 8; - break; - case 0x52: - case 0x61: - case 0x70: - num_item = 12; - break; - case 0x37: - num_item = 14; - } - // Ensure that the following is satisfied: - // sizeof(IndexType) + sizeof(ValueType) - // <= items_per_thread * sizeof(IndexType) - constexpr int minimal_num = - ceildiv(sizeof(IndexType) + sizeof(ValueType), sizeof(IndexType)); - int items_per_thread = num_item * 4 / sizeof(IndexType); - return std::max(minimal_num, items_per_thread); -} - - -template -void classical_spmv(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - const 
matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - using arithmetic_type = - highest_precision; - - const auto nwarps = exec->get_num_warps_per_sm() * - exec->get_num_multiprocessor() * - classical_oversubscription; - const auto gridx = - std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), - int64(nwarps / warps_in_block)); - const dim3 grid(gridx, b->get_size()[1]); - const auto block = spmv_block_size; - - const auto a_vals = - acc::helper::build_const_rrm_accessor(a); - const auto b_vals = - acc::helper::build_const_rrm_accessor(b); - auto c_vals = acc::helper::build_rrm_accessor(c); - if (alpha == nullptr && beta == nullptr) { - if (grid.x > 0 && grid.y > 0) { - kernel::abstract_classical_spmv - <<get_stream()>>>( - a->get_size()[0], acc::as_device_range(a_vals), - a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - acc::as_device_range(b_vals), acc::as_device_range(c_vals)); - } - } else if (alpha != nullptr && beta != nullptr) { - if (grid.x > 0 && grid.y > 0) { - kernel::abstract_classical_spmv - <<get_stream()>>>( - a->get_size()[0], as_device_type(alpha->get_const_values()), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - acc::as_device_range(b_vals), - as_device_type(beta->get_const_values()), - acc::as_device_range(c_vals)); - } - } else { - GKO_KERNEL_NOT_FOUND; - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); - - -template -void load_balance_spmv(std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - using arithmetic_type = - highest_precision; - - if (beta) { - dense::scale(exec, beta, c); - } else { - dense::fill(exec, c, zero()); - } - const IndexType nwarps = a->get_num_srow_elements(); - if (nwarps > 0) { - const dim3 
csr_block(config::warp_size, warps_in_block, 1); - const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]); - const auto a_vals = - acc::helper::build_const_rrm_accessor(a); - const auto b_vals = - acc::helper::build_const_rrm_accessor(b); - auto c_vals = acc::helper::build_rrm_accessor(c); - if (alpha) { - if (csr_grid.x > 0 && csr_grid.y > 0) { - kernel::abstract_spmv<<get_stream()>>>( - nwarps, static_cast(a->get_size()[0]), - as_device_type(alpha->get_const_values()), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - as_device_type(a->get_const_srow()), - acc::as_device_range(b_vals), acc::as_device_range(c_vals)); - } - } else { - if (csr_grid.x > 0 && csr_grid.y > 0) { - kernel::abstract_spmv<<get_stream()>>>( - nwarps, static_cast(a->get_size()[0]), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - as_device_type(a->get_const_srow()), - acc::as_device_range(b_vals), acc::as_device_range(c_vals)); - } - } - } -} - - -template -bool try_general_sparselib_spmv(std::shared_ptr exec, - const ValueType* alpha, - const matrix::Csr* a, - const matrix::Dense* b, - const ValueType* beta, - matrix::Dense* c) -{ - auto handle = exec->get_sparselib_handle(); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - if (!sparselib::is_supported::value || - b->get_stride() != 1 || c->get_stride() != 1 || b->get_size()[0] == 0 || - c->get_size()[0] == 0) { - return false; - } - - auto descr = sparselib::create_mat_descr(); - auto row_ptrs = a->get_const_row_ptrs(); - auto col_idxs = a->get_const_col_idxs(); - sparselib::spmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0], - a->get_size()[1], a->get_num_stored_elements(), alpha, - descr, a->get_const_values(), row_ptrs, col_idxs, - b->get_const_values(), beta, c->get_values()); - - sparselib::destroy(descr); -#else // CUDA_VERSION >= 11000 - // workaround for a division by zero in cuSPARSE 11.? 
- if (a->get_size()[1] == 0) { - return false; - } - cusparseOperation_t trans = SPARSELIB_OPERATION_NON_TRANSPOSE; - auto row_ptrs = const_cast(a->get_const_row_ptrs()); - auto col_idxs = const_cast(a->get_const_col_idxs()); - auto values = const_cast(a->get_const_values()); - auto mat = sparselib::create_csr(a->get_size()[0], a->get_size()[1], - a->get_num_stored_elements(), row_ptrs, - col_idxs, values); - auto b_val = const_cast(b->get_const_values()); - auto c_val = c->get_values(); - if (b->get_stride() == 1 && c->get_stride() == 1) { - auto vecb = sparselib::create_dnvec(b->get_size()[0], b_val); - auto vecc = sparselib::create_dnvec(c->get_size()[0], c_val); -#if CUDA_VERSION >= 11021 - constexpr auto alg = CUSPARSE_SPMV_CSR_ALG1; -#else - constexpr auto alg = CUSPARSE_CSRMV_ALG1; -#endif - size_type buffer_size = 0; - sparselib::spmv_buffersize(handle, trans, alpha, mat, vecb, - beta, vecc, alg, &buffer_size); - - array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - sparselib::spmv(handle, trans, alpha, mat, vecb, beta, vecc, - alg, buffer); - sparselib::destroy(vecb); - sparselib::destroy(vecc); - } else { -#if CUDA_VERSION >= 11060 - if (b->get_size()[1] == 1) { - // cusparseSpMM seems to take the single strided vector as column - // major without considering stride and row major (cuda 11.6) - return false; - } -#endif // CUDA_VERSION >= 11060 - cusparseSpMMAlg_t alg = CUSPARSE_SPMM_CSR_ALG2; - auto vecb = - sparselib::create_dnmat(b->get_size(), b->get_stride(), b_val); - auto vecc = - sparselib::create_dnmat(c->get_size(), c->get_stride(), c_val); - size_type buffer_size = 0; - sparselib::spmm_buffersize(handle, trans, trans, alpha, mat, - vecb, beta, vecc, alg, - &buffer_size); - - array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - sparselib::spmm(handle, trans, trans, alpha, mat, vecb, beta, - vecc, alg, buffer); - sparselib::destroy(vecb); - sparselib::destroy(vecc); - } - 
sparselib::destroy(mat); -#endif - return true; -} - - -template ::value || - !std::is_same::value>> -bool try_sparselib_spmv(std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - // TODO: support sparselib mixed - return false; -} - -template -bool try_sparselib_spmv(std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - if (alpha) { - return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b, - beta->get_const_values(), c); - } else { - auto handle = exec->get_sparselib_handle(); - sparselib::pointer_mode_guard pm_guard(handle); - const auto valpha = one(); - const auto vbeta = zero(); - return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c); - } -} - - -} // anonymous namespace -} // namespace host_kernel - - -template -void spmv(std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Dense* b, - matrix::Dense* c) -{ - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - } else if (a->get_strategy()->get_name() == "load_balance") { - host_kernel::load_balance_spmv(exec, a, b, c); - } else if (a->get_strategy()->get_name() == "merge_path") { - using arithmetic_type = - highest_precision; - int items_per_thread = - host_kernel::compute_items_per_thread( - exec); - host_kernel::select_merge_path_spmv( - compiled_kernels(), - [&items_per_thread](int compiled_info) { - return items_per_thread == compiled_info; - }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); - } else { - bool use_classical = true; - if (a->get_strategy()->get_name() == "sparselib" || - a->get_strategy()->get_name() == "cusparse") { - use_classical = !host_kernel::try_sparselib_spmv(exec, a, b, c); - } - if (use_classical) { - IndexType max_length_per_row = 0; - using Tcsr = 
matrix::Csr; - if (auto strategy = - std::dynamic_pointer_cast( - a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else if (auto strategy = std::dynamic_pointer_cast< - const typename Tcsr::automatical>( - a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else { - // as a fall-back: use average row length, at least 1 - max_length_per_row = a->get_num_stored_elements() / - std::max(a->get_size()[0], 1); - } - max_length_per_row = std::max(max_length_per_row, 1); - host_kernel::select_classical_spmv( - classical_kernels(), - [&max_length_per_row](int compiled_info) { - return max_length_per_row >= compiled_info; - }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); - } - } -} - - -template -void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Csr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) -{ - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - } else if (a->get_strategy()->get_name() == "load_balance") { - host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta); - } else if (a->get_strategy()->get_name() == "merge_path") { - using arithmetic_type = - highest_precision; - int items_per_thread = - host_kernel::compute_items_per_thread( - exec); - host_kernel::select_merge_path_spmv( - compiled_kernels(), - [&items_per_thread](int compiled_info) { - return items_per_thread == compiled_info; - }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); - } else { - bool use_classical = true; - if (a->get_strategy()->get_name() == "sparselib" || - a->get_strategy()->get_name() == "cusparse") { - use_classical = - !host_kernel::try_sparselib_spmv(exec, a, b, c, alpha, beta); - } - if (use_classical) { - IndexType max_length_per_row = 0; - using Tcsr = matrix::Csr; - if (auto strategy = - std::dynamic_pointer_cast( - a->get_strategy())) { - max_length_per_row = 
strategy->get_max_length_per_row(); - } else if (auto strategy = std::dynamic_pointer_cast< - const typename Tcsr::automatical>( - a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else { - // as a fall-back: use average row length, at least 1 - max_length_per_row = a->get_num_stored_elements() / - std::max(a->get_size()[0], 1); - } - max_length_per_row = std::max(max_length_per_row, 1); - host_kernel::select_classical_spmv( - classical_kernels(), - [&max_length_per_row](int compiled_info) { - return max_length_per_row >= compiled_info; - }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, - alpha, beta); - } - } -} - - -template -void spgemm(std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Csr* b, - matrix::Csr* c) -{ - auto a_vals = a->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto b_vals = b->get_const_values(); - auto b_row_ptrs = b->get_const_row_ptrs(); - auto b_col_idxs = b->get_const_col_idxs(); - auto c_row_ptrs = c->get_row_ptrs(); - - auto handle = exec->get_sparselib_handle(); - sparselib::pointer_mode_guard pm_guard(handle); - - auto alpha = one(); - auto a_nnz = static_cast(a->get_num_stored_elements()); - auto b_nnz = static_cast(b->get_num_stored_elements()); - auto null_value = static_cast(nullptr); - auto null_index = static_cast(nullptr); - auto zero_nnz = IndexType{}; - auto m = IndexType(a->get_size()[0]); - auto n = IndexType(b->get_size()[1]); - auto k = IndexType(a->get_size()[1]); - matrix::CsrBuilder c_builder{c}; - auto& c_col_idxs_array = c_builder.get_col_idx_array(); - auto& c_vals_array = c_builder.get_value_array(); - -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - if (!sparselib::is_supported::value) { - GKO_NOT_IMPLEMENTED; - } - - auto a_descr = sparselib::create_mat_descr(); - auto b_descr = sparselib::create_mat_descr(); - auto c_descr = sparselib::create_mat_descr(); - auto d_descr = 
sparselib::create_mat_descr(); - auto info = sparselib::create_spgemm_info(); - // allocate buffer - size_type buffer_size{}; - sparselib::spgemm_buffer_size( - handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs, - b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz, - null_index, null_index, info, buffer_size); - array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - - // count nnz - IndexType c_nnz{}; - sparselib::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs, - a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, - d_descr, zero_nnz, null_index, null_index, c_descr, - c_row_ptrs, &c_nnz, info, buffer); - - // accumulate non-zeros - c_col_idxs_array.resize_and_reset(c_nnz); - c_vals_array.resize_and_reset(c_nnz); - auto c_col_idxs = c_col_idxs_array.get_data(); - auto c_vals = c_vals_array.get_data(); - sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals, - a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, - b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz, - null_value, null_index, null_index, c_descr, c_vals, - c_row_ptrs, c_col_idxs, info, buffer); - - sparselib::destroy(info); - sparselib::destroy(d_descr); - sparselib::destroy(c_descr); - sparselib::destroy(b_descr); - sparselib::destroy(a_descr); - -#else // CUDA_VERSION >= 11000 - const auto beta = zero(); - auto spgemm_descr = sparselib::create_spgemm_descr(); - auto a_descr = sparselib::create_csr( - m, k, a_nnz, const_cast(a_row_ptrs), - const_cast(a_col_idxs), const_cast(a_vals)); - auto b_descr = sparselib::create_csr( - k, n, b_nnz, const_cast(b_row_ptrs), - const_cast(b_col_idxs), const_cast(b_vals)); - auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index, - null_value); - - // estimate work - size_type buffer1_size{}; - sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, - c_descr, spgemm_descr, buffer1_size, - nullptr); - array buffer1{exec, buffer1_size}; - 
sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, - c_descr, spgemm_descr, buffer1_size, - buffer1.get_data()); - - // compute spgemm - size_type buffer2_size{}; - sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, - spgemm_descr, buffer1.get_data(), buffer2_size, - nullptr); - array buffer2{exec, buffer2_size}; - sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, - spgemm_descr, buffer1.get_data(), buffer2_size, - buffer2.get_data()); - - // copy data to result - auto c_nnz = sparselib::sparse_matrix_nnz(c_descr); - c_col_idxs_array.resize_and_reset(c_nnz); - c_vals_array.resize_and_reset(c_nnz); - sparselib::csr_set_pointers(c_descr, c_row_ptrs, - c_col_idxs_array.get_data(), - c_vals_array.get_data()); - - sparselib::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr, - spgemm_descr); - - sparselib::destroy(c_descr); - sparselib::destroy(b_descr); - sparselib::destroy(a_descr); - sparselib::destroy(spgemm_descr); -#endif // CUDA_VERSION >= 11000 -} - - -template -void advanced_spgemm(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Csr* a, - const matrix::Csr* b, - const matrix::Dense* beta, - const matrix::Csr* d, - matrix::Csr* c) -{ - auto handle = exec->get_sparselib_handle(); - sparselib::pointer_mode_guard pm_guard(handle); - - auto valpha = exec->copy_val_to_host(alpha->get_const_values()); - auto a_nnz = IndexType(a->get_num_stored_elements()); - auto a_vals = a->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto b_nnz = IndexType(b->get_num_stored_elements()); - auto b_vals = b->get_const_values(); - auto b_row_ptrs = b->get_const_row_ptrs(); - auto b_col_idxs = b->get_const_col_idxs(); - auto vbeta = exec->copy_val_to_host(beta->get_const_values()); - auto d_nnz = IndexType(d->get_num_stored_elements()); - auto d_vals = d->get_const_values(); - auto d_row_ptrs = 
d->get_const_row_ptrs(); - auto d_col_idxs = d->get_const_col_idxs(); - auto m = IndexType(a->get_size()[0]); - auto n = IndexType(b->get_size()[1]); - auto k = IndexType(a->get_size()[1]); - auto c_row_ptrs = c->get_row_ptrs(); - -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - if (!sparselib::is_supported::value) { - GKO_NOT_IMPLEMENTED; - } - - matrix::CsrBuilder c_builder{c}; - auto& c_col_idxs_array = c_builder.get_col_idx_array(); - auto& c_vals_array = c_builder.get_value_array(); - auto a_descr = sparselib::create_mat_descr(); - auto b_descr = sparselib::create_mat_descr(); - auto c_descr = sparselib::create_mat_descr(); - auto d_descr = sparselib::create_mat_descr(); - auto info = sparselib::create_spgemm_info(); - // allocate buffer - size_type buffer_size{}; - sparselib::spgemm_buffer_size( - handle, m, n, k, &valpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs, - b_descr, b_nnz, b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, - d_row_ptrs, d_col_idxs, info, buffer_size); - array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - - // count nnz - IndexType c_nnz{}; - sparselib::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs, - a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, - d_descr, d_nnz, d_row_ptrs, d_col_idxs, c_descr, - c_row_ptrs, &c_nnz, info, buffer); - - // accumulate non-zeros - c_col_idxs_array.resize_and_reset(c_nnz); - c_vals_array.resize_and_reset(c_nnz); - auto c_col_idxs = c_col_idxs_array.get_data(); - auto c_vals = c_vals_array.get_data(); - sparselib::spgemm(handle, m, n, k, &valpha, a_descr, a_nnz, a_vals, - a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, - b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, d_vals, - d_row_ptrs, d_col_idxs, c_descr, c_vals, c_row_ptrs, - c_col_idxs, info, buffer); - - sparselib::destroy(info); - sparselib::destroy(d_descr); - sparselib::destroy(c_descr); - sparselib::destroy(b_descr); - sparselib::destroy(a_descr); -#else // CUDA_VERSION >= 11000 - auto null_value = 
static_cast(nullptr); - auto null_index = static_cast(nullptr); - auto one_val = one(); - auto zero_val = zero(); - auto zero_nnz = IndexType{}; - auto spgemm_descr = sparselib::create_spgemm_descr(); - auto a_descr = sparselib::create_csr( - m, k, a_nnz, const_cast(a_row_ptrs), - const_cast(a_col_idxs), const_cast(a_vals)); - auto b_descr = sparselib::create_csr( - k, n, b_nnz, const_cast(b_row_ptrs), - const_cast(b_col_idxs), const_cast(b_vals)); - auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index, - null_value); - - // estimate work - size_type buffer1_size{}; - sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, - &zero_val, c_descr, spgemm_descr, - buffer1_size, nullptr); - array buffer1{exec, buffer1_size}; - sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, - &zero_val, c_descr, spgemm_descr, - buffer1_size, buffer1.get_data()); - - // compute spgemm - size_type buffer2_size{}; - sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, - c_descr, spgemm_descr, buffer1.get_data(), - buffer2_size, nullptr); - array buffer2{exec, buffer2_size}; - sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, - c_descr, spgemm_descr, buffer1.get_data(), - buffer2_size, buffer2.get_data()); - - // write result to temporary storage - auto c_tmp_nnz = sparselib::sparse_matrix_nnz(c_descr); - array c_tmp_row_ptrs_array(exec, m + 1); - array c_tmp_col_idxs_array(exec, c_tmp_nnz); - array c_tmp_vals_array(exec, c_tmp_nnz); - sparselib::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(), - c_tmp_col_idxs_array.get_data(), - c_tmp_vals_array.get_data()); - - sparselib::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val, - c_descr, spgemm_descr); - - sparselib::destroy(c_descr); - sparselib::destroy(b_descr); - sparselib::destroy(a_descr); - sparselib::destroy(spgemm_descr); - - auto spgeam_total_nnz = c_tmp_nnz + d->get_num_stored_elements(); - auto nnz_per_row = 
spgeam_total_nnz / m; - select_spgeam( - spgeam_kernels(), - [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= nnz_per_row || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, - alpha->get_const_values(), c_tmp_row_ptrs_array.get_const_data(), - c_tmp_col_idxs_array.get_const_data(), - c_tmp_vals_array.get_const_data(), beta->get_const_values(), d_row_ptrs, - d_col_idxs, d_vals, c); -#endif // CUDA_VERSION >= 11000 -} - - -template -void transpose(std::shared_ptr exec, - const matrix::Csr* orig, - matrix::Csr* trans) -{ - if (orig->get_size()[0] == 0) { - return; - } - if (sparselib::is_supported::value) { -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; - cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - - sparselib::transpose( - exec->get_sparselib_handle(), orig->get_size()[0], - orig->get_size()[1], orig->get_num_stored_elements(), - orig->get_const_values(), orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), trans->get_values(), - trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase); -#else // CUDA_VERSION >= 11000 - cudaDataType_t cu_value = - gko::kernels::cuda::cuda_data_type(); - cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; - cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1; - size_type buffer_size = 0; - sparselib::transpose_buffersize( - exec->get_sparselib_handle(), orig->get_size()[0], - orig->get_size()[1], orig->get_num_stored_elements(), - orig->get_const_values(), orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), trans->get_values(), - trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues, - idxBase, alg, &buffer_size); - array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - sparselib::transpose( - exec->get_sparselib_handle(), orig->get_size()[0], - orig->get_size()[1], 
orig->get_num_stored_elements(), - orig->get_const_values(), orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), trans->get_values(), - trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues, - idxBase, alg, buffer); -#endif - } else { - fallback_transpose(exec, orig, trans); - } -} - - -template -void conj_transpose(std::shared_ptr exec, - const matrix::Csr* orig, - matrix::Csr* trans) -{ - if (orig->get_size()[0] == 0) { - return; - } - const auto block_size = default_block_size; - const auto grid_size = - ceildiv(trans->get_num_stored_elements(), block_size); - if (sparselib::is_supported::value) { -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; - cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - - sparselib::transpose( - exec->get_sparselib_handle(), orig->get_size()[0], - orig->get_size()[1], orig->get_num_stored_elements(), - orig->get_const_values(), orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), trans->get_values(), - trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase); -#else // CUDA_VERSION >= 11000 - cudaDataType_t cu_value = - gko::kernels::cuda::cuda_data_type(); - cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; - cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1; - size_type buffer_size = 0; - sparselib::transpose_buffersize( - exec->get_sparselib_handle(), orig->get_size()[0], - orig->get_size()[1], orig->get_num_stored_elements(), - orig->get_const_values(), orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), trans->get_values(), - trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues, - idxBase, alg, &buffer_size); - array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - sparselib::transpose( - exec->get_sparselib_handle(), orig->get_size()[0], - orig->get_size()[1], orig->get_num_stored_elements(), - orig->get_const_values(), 
orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), trans->get_values(), - trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues, - idxBase, alg, buffer); -#endif - } else { - fallback_transpose(exec, orig, trans); - } - if (grid_size > 0 && is_complex()) { - kernel::conjugate<<get_stream()>>>( - trans->get_num_stored_elements(), - as_device_type(trans->get_values())); - } -} - - -template -void sort_by_column_index(std::shared_ptr exec, - matrix::Csr* to_sort) -{ - if (sparselib::is_supported::value) { - auto handle = exec->get_sparselib_handle(); - auto descr = sparselib::create_mat_descr(); - auto m = IndexType(to_sort->get_size()[0]); - auto n = IndexType(to_sort->get_size()[1]); - auto nnz = IndexType(to_sort->get_num_stored_elements()); - auto row_ptrs = to_sort->get_const_row_ptrs(); - auto col_idxs = to_sort->get_col_idxs(); - auto vals = to_sort->get_values(); - - // copy values - array tmp_vals_array(exec, nnz); - exec->copy(nnz, vals, tmp_vals_array.get_data()); - auto tmp_vals = tmp_vals_array.get_const_data(); - - // init identity permutation - array permutation_array(exec, nnz); - auto permutation = permutation_array.get_data(); - components::fill_seq_array(exec, permutation, nnz); - - // allocate buffer - size_type buffer_size{}; - sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs, - buffer_size); - array buffer_array{exec, buffer_size}; - auto buffer = buffer_array.get_data(); - - // sort column indices - sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs, - permutation, buffer); - - // sort values -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - sparselib::gather(handle, nnz, tmp_vals, vals, permutation); -#else // CUDA_VERSION >= 11000 - auto val_vec = sparselib::create_spvec(nnz, nnz, permutation, vals); - auto tmp_vec = - sparselib::create_dnvec(nnz, const_cast(tmp_vals)); - sparselib::gather(handle, tmp_vec, val_vec); -#endif - - sparselib::destroy(descr); - } else { - fallback_sort(exec, 
to_sort); - } -} - - -} // namespace csr -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 67617169b5a..30e675509d5 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.21) include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) -add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE) +add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp deleted file mode 100644 index 1fb086c5ea6..00000000000 --- a/hip/matrix/csr_kernels.template.hip.cpp +++ /dev/null @@ -1,798 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/csr_kernels.hpp" - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "accessor/cuda_hip_helper.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/pointer_mode_guard.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/atomic.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include 
"common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/merging.hpp" -#include "common/cuda_hip/components/prefix_sum.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/segment_scan.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/array_access.hpp" -#include "core/base/mixed_precision_types.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/components/format_conversion_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/csr_accessor_helper.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The Compressed sparse row matrix format namespace. - * - * @ingroup csr - */ -namespace csr { - - -constexpr int default_block_size = 512; -constexpr int warps_in_block = 4; -constexpr int spmv_block_size = warps_in_block * config::warp_size; -constexpr int classical_oversubscription = 32; - - -/** - * A compile-time list of the number items per threads for which spmv kernel - * should be compiled. 
- */ -using compiled_kernels = syn::value_list; - -using classical_kernels = - syn::value_list; - -using spgeam_kernels = - syn::value_list; - - -#include "common/cuda_hip/matrix/csr_common.hpp.inc" -#include "common/cuda_hip/matrix/csr_kernels.hpp.inc" - - -namespace host_kernel { -namespace { - - -template -void merge_path_spmv(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - using arithmetic_type = - highest_precision; - const IndexType total = a->get_size()[0] + a->get_num_stored_elements(); - const IndexType grid_num = - ceildiv(total, spmv_block_size * items_per_thread); - const auto grid = grid_num; - const auto block = spmv_block_size; - // TODO: workspace? - array row_out(exec, grid_num); - // TODO: should we store the value in arithmetic_type or output_type? - array val_out(exec, grid_num); - - const auto a_vals = - acc::helper::build_const_rrm_accessor(a); - - for (IndexType column_id = 0; column_id < b->get_size()[1]; column_id++) { - const auto column_span = - acc::index_span(static_cast(column_id), - static_cast(column_id + 1)); - const auto b_vals = - acc::helper::build_const_rrm_accessor(b, - column_span); - auto c_vals = - acc::helper::build_rrm_accessor(c, column_span); - if (alpha == nullptr && beta == nullptr) { - if (grid_num > 0) { - kernel::abstract_merge_path_spmv - <<get_stream()>>>( - static_cast(a->get_size()[0]), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - as_device_type(a->get_const_srow()), - acc::as_device_range(b_vals), - acc::as_device_range(c_vals), - as_device_type(row_out.get_data()), - as_device_type(val_out.get_data())); - } - kernel:: - abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>( - grid_num, as_device_type(val_out.get_data()), - as_device_type(row_out.get_data()), - acc::as_device_range(c_vals)); - - } else 
if (alpha != nullptr && beta != nullptr) { - if (grid_num > 0) { - kernel::abstract_merge_path_spmv - <<get_stream()>>>( - static_cast(a->get_size()[0]), - as_device_type(alpha->get_const_values()), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - as_device_type(a->get_const_srow()), - acc::as_device_range(b_vals), - as_device_type(beta->get_const_values()), - acc::as_device_range(c_vals), - as_device_type(row_out.get_data()), - as_device_type(val_out.get_data())); - } - kernel:: - abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>( - grid_num, as_device_type(val_out.get_data()), - as_device_type(row_out.get_data()), - as_device_type(alpha->get_const_values()), - acc::as_device_range(c_vals)); - } else { - GKO_KERNEL_NOT_FOUND; - } - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv); - - -template -int compute_items_per_thread(std::shared_ptr exec) -{ -#if GINKGO_HIP_PLATFORM_NVCC - - - const int version = - (exec->get_major_version() << 4) + exec->get_minor_version(); - // The num_item is decided to make the occupancy 100% - // TODO: Extend this list when new GPU is released - // Tune this parameter - // 128 threads/block the number of items per threads - // 3.0 3.5: 6 - // 3.7: 14 - // 5.0, 5.3, 6.0, 6.2: 8 - // 5.2, 6.1, 7.0: 12 - int num_item = 6; - switch (version) { - case 0x50: - case 0x53: - case 0x60: - case 0x62: - num_item = 8; - break; - case 0x52: - case 0x61: - case 0x70: - num_item = 12; - break; - case 0x37: - num_item = 14; - } - - -#else - - - // HIP uses the minimal num_item to make the code work correctly. - // TODO: this parameter should be tuned. 
- int num_item = 6; - - -#endif // GINKGO_HIP_PLATFORM_NVCC - - - // Ensure that the following is satisfied: - // sizeof(IndexType) + sizeof(ValueType) - // <= items_per_thread * sizeof(IndexType) - constexpr int minimal_num = - ceildiv(sizeof(IndexType) + sizeof(ValueType), sizeof(IndexType)); - int items_per_thread = num_item * 4 / sizeof(IndexType); - return std::max(minimal_num, items_per_thread); -} - - -template -void classical_spmv(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - using arithmetic_type = - highest_precision; - - const auto nwarps = exec->get_num_warps_per_sm() * - exec->get_num_multiprocessor() * - classical_oversubscription; - const auto gridx = - std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), - int64(nwarps / warps_in_block)); - const dim3 grid(gridx, b->get_size()[1]); - const auto block = spmv_block_size; - - const auto a_vals = - acc::helper::build_const_rrm_accessor(a); - const auto b_vals = - acc::helper::build_const_rrm_accessor(b); - auto c_vals = acc::helper::build_rrm_accessor(c); - if (alpha == nullptr && beta == nullptr) { - if (grid.x > 0 && grid.y > 0) { - kernel::abstract_classical_spmv - <<get_stream()>>>( - a->get_size()[0], acc::as_device_range(a_vals), - a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - acc::as_device_range(b_vals), acc::as_device_range(c_vals)); - } - } else if (alpha != nullptr && beta != nullptr) { - if (grid.x > 0 && grid.y > 0) { - kernel::abstract_classical_spmv - <<get_stream()>>>( - a->get_size()[0], as_device_type(alpha->get_const_values()), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - acc::as_device_range(b_vals), - as_device_type(beta->get_const_values()), - acc::as_device_range(c_vals)); - } - } else { - GKO_KERNEL_NOT_FOUND; - } -} - 
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); - - -template -void load_balance_spmv(std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - using arithmetic_type = - highest_precision; - - if (beta) { - dense::scale(exec, beta, c); - } else { - dense::fill(exec, c, zero()); - } - const IndexType nwarps = a->get_num_srow_elements(); - if (nwarps > 0) { - const dim3 csr_block(config::warp_size, warps_in_block, 1); - const dim3 csr_grid(ceildiv(nwarps, warps_in_block), b->get_size()[1]); - const auto a_vals = - acc::helper::build_const_rrm_accessor(a); - const auto b_vals = - acc::helper::build_const_rrm_accessor(b); - auto c_vals = acc::helper::build_rrm_accessor(c); - if (alpha) { - if (csr_grid.x > 0 && csr_grid.y > 0) { - kernel::abstract_spmv<<get_stream()>>>( - nwarps, static_cast(a->get_size()[0]), - as_device_type(alpha->get_const_values()), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - as_device_type(a->get_const_srow()), - acc::as_device_range(b_vals), acc::as_device_range(c_vals)); - } - } else { - if (csr_grid.x > 0 && csr_grid.y > 0) { - kernel::abstract_spmv<<get_stream()>>>( - nwarps, static_cast(a->get_size()[0]), - acc::as_device_range(a_vals), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - as_device_type(a->get_const_srow()), - acc::as_device_range(b_vals), acc::as_device_range(c_vals)); - } - } - } -} - - -template -bool try_general_sparselib_spmv(std::shared_ptr exec, - const ValueType* alpha, - const matrix::Csr* a, - const matrix::Dense* b, - const ValueType* beta, - matrix::Dense* c) -{ - bool try_sparselib = sparselib::is_supported::value; - try_sparselib = - try_sparselib && b->get_stride() == 1 && c->get_stride() == 1; - // rocSPARSE has issues with zero matrices - try_sparselib = try_sparselib && 
a->get_num_stored_elements() > 0; - if (try_sparselib) { - auto descr = sparselib::create_mat_descr(); - - auto row_ptrs = a->get_const_row_ptrs(); - auto col_idxs = a->get_const_col_idxs(); - - sparselib::spmv(exec->get_sparselib_handle(), - SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0], - a->get_size()[1], a->get_num_stored_elements(), alpha, - descr, a->get_const_values(), row_ptrs, col_idxs, - b->get_const_values(), beta, c->get_values()); - - sparselib::destroy(descr); - } - return try_sparselib; -} - - -template ::value || - !std::is_same::value>> -bool try_sparselib_spmv(std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - // TODO: support sparselib mixed - return false; -} - -template -bool try_sparselib_spmv(std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - if (alpha) { - return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b, - beta->get_const_values(), c); - } else { - auto handle = exec->get_sparselib_handle(); - sparselib::pointer_mode_guard pm_guard(handle); - const auto valpha = one(); - const auto vbeta = zero(); - return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c); - } -} - - -} // anonymous namespace -} // namespace host_kernel - - -template -void spmv(std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Dense* b, - matrix::Dense* c) -{ - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - } else if (a->get_strategy()->get_name() == "load_balance") { - host_kernel::load_balance_spmv(exec, a, b, c); - } else if (a->get_strategy()->get_name() == "merge_path") { - using arithmetic_type = - highest_precision; - int items_per_thread = - host_kernel::compute_items_per_thread( - exec); - host_kernel::select_merge_path_spmv( - 
compiled_kernels(), - [&items_per_thread](int compiled_info) { - return items_per_thread == compiled_info; - }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); - } else { - bool use_classical = true; - if (a->get_strategy()->get_name() == "sparselib" || - a->get_strategy()->get_name() == "cusparse") { - use_classical = !host_kernel::try_sparselib_spmv(exec, a, b, c); - } - if (use_classical) { - IndexType max_length_per_row = 0; - using Tcsr = matrix::Csr; - if (auto strategy = - std::dynamic_pointer_cast( - a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else if (auto strategy = std::dynamic_pointer_cast< - const typename Tcsr::automatical>( - a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else { - // as a fall-back: use average row length, at least 1 - max_length_per_row = a->get_num_stored_elements() / - std::max(a->get_size()[0], 1); - } - max_length_per_row = std::max(max_length_per_row, 1); - host_kernel::select_classical_spmv( - classical_kernels(), - [&max_length_per_row](int compiled_info) { - return max_length_per_row >= compiled_info; - }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); - } - } -} - - -template -void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Csr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) -{ - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - } else if (a->get_strategy()->get_name() == "load_balance") { - host_kernel::load_balance_spmv(exec, a, b, c, alpha, beta); - } else if (a->get_strategy()->get_name() == "merge_path") { - using arithmetic_type = - highest_precision; - int items_per_thread = - host_kernel::compute_items_per_thread( - exec); - host_kernel::select_merge_path_spmv( - compiled_kernels(), - [&items_per_thread](int compiled_info) { - return items_per_thread == compiled_info; - }, - syn::value_list(), syn::type_list<>(), 
exec, a, b, c, alpha, - beta); - } else { - bool use_classical = true; - if (a->get_strategy()->get_name() == "sparselib" || - a->get_strategy()->get_name() == "cusparse") { - use_classical = - !host_kernel::try_sparselib_spmv(exec, a, b, c, alpha, beta); - } - if (use_classical) { - IndexType max_length_per_row = 0; - using Tcsr = matrix::Csr; - if (auto strategy = - std::dynamic_pointer_cast( - a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else if (auto strategy = std::dynamic_pointer_cast< - const typename Tcsr::automatical>( - a->get_strategy())) { - max_length_per_row = strategy->get_max_length_per_row(); - } else { - // as a fall-back: use average row length, at least 1 - max_length_per_row = a->get_num_stored_elements() / - std::max(a->get_size()[0], 1); - } - max_length_per_row = std::max(max_length_per_row, 1); - host_kernel::select_classical_spmv( - classical_kernels(), - [&max_length_per_row](int compiled_info) { - return max_length_per_row >= compiled_info; - }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, - alpha, beta); - } - } -} - - -template -void spgemm(std::shared_ptr exec, - const matrix::Csr* a, - const matrix::Csr* b, - matrix::Csr* c) -{ - if (sparselib::is_supported::value) { - auto handle = exec->get_sparselib_handle(); - sparselib::pointer_mode_guard pm_guard(handle); - auto a_descr = sparselib::create_mat_descr(); - auto b_descr = sparselib::create_mat_descr(); - auto c_descr = sparselib::create_mat_descr(); - auto d_descr = sparselib::create_mat_descr(); - auto info = sparselib::create_spgemm_info(); - - auto alpha = one(); - auto a_nnz = static_cast(a->get_num_stored_elements()); - auto a_vals = a->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto b_nnz = static_cast(b->get_num_stored_elements()); - auto b_vals = b->get_const_values(); - auto b_row_ptrs = b->get_const_row_ptrs(); - auto b_col_idxs = 
b->get_const_col_idxs(); - auto null_value = static_cast(nullptr); - auto null_index = static_cast(nullptr); - auto zero_nnz = IndexType{}; - auto m = static_cast(a->get_size()[0]); - auto n = static_cast(b->get_size()[1]); - auto k = static_cast(a->get_size()[1]); - auto c_row_ptrs = c->get_row_ptrs(); - matrix::CsrBuilder c_builder{c}; - auto& c_col_idxs_array = c_builder.get_col_idx_array(); - auto& c_vals_array = c_builder.get_value_array(); - - // allocate buffer - size_type buffer_size{}; - sparselib::spgemm_buffer_size( - handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs, - b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, - zero_nnz, null_index, null_index, info, buffer_size); - array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - - // count nnz - IndexType c_nnz{}; - sparselib::spgemm_nnz( - handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, - b_nnz, b_row_ptrs, b_col_idxs, d_descr, zero_nnz, null_index, - null_index, c_descr, c_row_ptrs, &c_nnz, info, buffer); - - // accumulate non-zeros - c_col_idxs_array.resize_and_reset(c_nnz); - c_vals_array.resize_and_reset(c_nnz); - auto c_col_idxs = c_col_idxs_array.get_data(); - auto c_vals = c_vals_array.get_data(); - sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals, - a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, - b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz, - null_value, null_index, null_index, c_descr, c_vals, - c_row_ptrs, c_col_idxs, info, buffer); - - sparselib::destroy_spgemm_info(info); - sparselib::destroy(d_descr); - sparselib::destroy(c_descr); - sparselib::destroy(b_descr); - sparselib::destroy(a_descr); - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -template -void advanced_spgemm(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Csr* a, - const matrix::Csr* b, - const matrix::Dense* beta, - const matrix::Csr* d, - matrix::Csr* c) -{ - if (sparselib::is_supported::value) { - auto handle 
= exec->get_sparselib_handle(); - sparselib::pointer_mode_guard pm_guard(handle); - auto a_descr = sparselib::create_mat_descr(); - auto b_descr = sparselib::create_mat_descr(); - auto c_descr = sparselib::create_mat_descr(); - auto d_descr = sparselib::create_mat_descr(); - auto info = sparselib::create_spgemm_info(); - - auto a_nnz = static_cast(a->get_num_stored_elements()); - auto a_vals = a->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto b_nnz = static_cast(b->get_num_stored_elements()); - auto b_vals = b->get_const_values(); - auto b_row_ptrs = b->get_const_row_ptrs(); - auto b_col_idxs = b->get_const_col_idxs(); - auto d_vals = d->get_const_values(); - auto d_row_ptrs = d->get_const_row_ptrs(); - auto d_col_idxs = d->get_const_col_idxs(); - auto null_value = static_cast(nullptr); - auto null_index = static_cast(nullptr); - auto one_value = one(); - auto m = static_cast(a->get_size()[0]); - auto n = static_cast(b->get_size()[1]); - auto k = static_cast(a->get_size()[1]); - - // allocate buffer - size_type buffer_size{}; - sparselib::spgemm_buffer_size( - handle, m, n, k, &one_value, a_descr, a_nnz, a_row_ptrs, a_col_idxs, - b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, - IndexType{}, null_index, null_index, info, buffer_size); - array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - - // count nnz - array c_tmp_row_ptrs_array(exec, m + 1); - auto c_tmp_row_ptrs = c_tmp_row_ptrs_array.get_data(); - IndexType c_nnz{}; - sparselib::spgemm_nnz( - handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, - b_nnz, b_row_ptrs, b_col_idxs, d_descr, IndexType{}, null_index, - null_index, c_descr, c_tmp_row_ptrs, &c_nnz, info, buffer); - - // accumulate non-zeros for A * B - array c_tmp_col_idxs_array(exec, c_nnz); - array c_tmp_vals_array(exec, c_nnz); - auto c_tmp_col_idxs = c_tmp_col_idxs_array.get_data(); - auto c_tmp_vals = 
c_tmp_vals_array.get_data(); - sparselib::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals, - a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, - b_row_ptrs, b_col_idxs, null_value, d_descr, - IndexType{}, null_value, null_index, null_index, - c_descr, c_tmp_vals, c_tmp_row_ptrs, c_tmp_col_idxs, - info, buffer); - - // destroy hipsparse context - sparselib::destroy_spgemm_info(info); - sparselib::destroy(d_descr); - sparselib::destroy(c_descr); - sparselib::destroy(b_descr); - sparselib::destroy(a_descr); - - auto total_nnz = c_nnz + d->get_num_stored_elements(); - auto nnz_per_row = total_nnz / m; - select_spgeam( - spgeam_kernels(), - [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= nnz_per_row || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, - alpha->get_const_values(), c_tmp_row_ptrs, c_tmp_col_idxs, - c_tmp_vals, beta->get_const_values(), d_row_ptrs, d_col_idxs, - d_vals, c); - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -template -void transpose(std::shared_ptr exec, - const matrix::Csr* orig, - matrix::Csr* trans) -{ - if (orig->get_size()[0] == 0) { - return; - } - if (sparselib::is_supported::value) { - hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC; - hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO; - - sparselib::transpose( - exec->get_sparselib_handle(), orig->get_size()[0], - orig->get_size()[1], orig->get_num_stored_elements(), - orig->get_const_values(), orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), trans->get_values(), - trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase); - } else { - fallback_transpose(exec, orig, trans); - } -} - - -template -void conj_transpose(std::shared_ptr exec, - const matrix::Csr* orig, - matrix::Csr* trans) -{ - if (orig->get_size()[0] == 0) { - return; - } - const auto block_size = default_block_size; - const auto grid_size = - ceildiv(trans->get_num_stored_elements(), block_size); - if 
(sparselib::is_supported::value) { - hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC; - hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO; - - sparselib::transpose( - exec->get_sparselib_handle(), orig->get_size()[0], - orig->get_size()[1], orig->get_num_stored_elements(), - orig->get_const_values(), orig->get_const_row_ptrs(), - orig->get_const_col_idxs(), trans->get_values(), - trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase); - } else { - fallback_transpose(exec, orig, trans); - } - if (grid_size > 0 && is_complex()) { - kernel::conjugate<<get_stream()>>>( - trans->get_num_stored_elements(), - as_device_type(trans->get_values())); - } -} - - -template -void sort_by_column_index(std::shared_ptr exec, - matrix::Csr* to_sort) -{ - if (sparselib::is_supported::value) { - auto handle = exec->get_sparselib_handle(); - auto descr = sparselib::create_mat_descr(); - auto m = IndexType(to_sort->get_size()[0]); - auto n = IndexType(to_sort->get_size()[1]); - auto nnz = IndexType(to_sort->get_num_stored_elements()); - auto row_ptrs = to_sort->get_const_row_ptrs(); - auto col_idxs = to_sort->get_col_idxs(); - auto vals = to_sort->get_values(); - - // copy values - array tmp_vals_array(exec, nnz); - exec->copy(nnz, vals, tmp_vals_array.get_data()); - auto tmp_vals = tmp_vals_array.get_const_data(); - - // init identity permutation - array permutation_array(exec, nnz); - auto permutation = permutation_array.get_data(); - components::fill_seq_array(exec, permutation, nnz); - - // allocate buffer - size_type buffer_size{}; - sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs, - buffer_size); - array buffer_array{exec, buffer_size}; - auto buffer = buffer_array.get_data(); - - // sort column indices - sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs, - permutation, buffer); - - // sort values - sparselib::gather(handle, nnz, tmp_vals, vals, permutation); - - sparselib::destroy(descr); - } else { - 
fallback_sort(exec, to_sort); - } -} - - -} // namespace csr -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko From c063bfa1c4a095508541e775cfa0e3f5e045e96f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 15 Jul 2024 22:30:46 +0200 Subject: [PATCH 093/448] remove solver_progress reliance on uninitialized values --- core/test/log/solver_progress.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/core/test/log/solver_progress.cpp b/core/test/log/solver_progress.cpp index f2433779864..fe8a4537f66 100644 --- a/core/test/log/solver_progress.cpp +++ b/core/test/log/solver_progress.cpp @@ -86,7 +86,7 @@ TYPED_TEST(SolverProgress, TableWorks) << std::setw(default_column_width) << "implicit_sq_residual_norm" << '\n'; ref_ss << std::setw(default_column_width) << 0 - << std::setw(default_column_width) << T{0.0} + << std::setw(default_column_width) << T{4.0} << std::setw(default_column_width) << T{1.0} << std::setw(default_column_width) << T{4.0} << std::setw(default_column_width) << T{4.0} << '\n' @@ -95,18 +95,16 @@ TYPED_TEST(SolverProgress, TableWorks) << std::setw(default_column_width) << T{0.0} << std::setw(default_column_width) << T{4.0} << std::setw(default_column_width) << T{0.0} << '\n'; + // run the solve once so the internal vectors are initialized before + // attaching the logger + this->solver->apply(this->in, this->out->clone()); std::stringstream ss; this->solver->add_logger( gko::log::SolverProgress::create_scalar_table_writer(ss)); this->solver->apply(this->in, this->out); - // the first value of beta is uninitialized, so we need to remove it - std::regex first_beta("\n 0 *[()0-9.e,+-]*"); - auto clean_str = std::regex_replace(ss.str(), first_beta, "\n 0"); - auto clean_ref = - std::regex_replace(ref_ss.str(), first_beta, "\n 0"); - ASSERT_EQ(clean_str, clean_ref); + ASSERT_EQ(ss.str(), ref_ss.str()); } @@ -119,21 +117,20 @@ TYPED_TEST(SolverProgress, CsvWorks) << 
this->out.get() << ") of dimensions " << this->solver->get_size() << " and " << this->in->get_size()[1] << " rhs\n"; ref_ss << "Iteration;beta;prev_rho;rho;implicit_sq_residual_norm" << '\n'; - ref_ss << 0 << ';' << T{0.0} << ';' << T{1.0} << ';' << T{4.0} << ';' + ref_ss << 0 << ';' << T{4.0} << ';' << T{1.0} << ';' << T{4.0} << ';' << T{4.0} << '\n' << 1 << ';' << T{4.0} << ';' << T{0.0} << ';' << T{4.0} << ';' << T{0.0} << '\n'; + // run the solve once so the internal vectors are initialized before + // attaching the logger + this->solver->apply(this->in, this->out->clone()); std::stringstream ss; this->solver->add_logger( gko::log::SolverProgress::create_scalar_csv_writer(ss, 6, ';')); this->solver->apply(this->in, this->out); - // the first value of beta is uninitialized, so we need to remove it - std::regex first_beta("\n0;[^;]*"); - auto clean_str = std::regex_replace(ss.str(), first_beta, "\n0;"); - auto clean_ref = std::regex_replace(ref_ss.str(), first_beta, "\n0;"); - ASSERT_EQ(clean_str, clean_ref); + ASSERT_EQ(ss.str(), ref_ss.str()); } @@ -171,6 +168,9 @@ TYPED_TEST(SolverProgress, StorageWorks) {"solver_progress_test_initial_guess", orig_out.get()}, {"solver_progress_test_rhs", this->in.get()}, {"solver_progress_test_system_matrix", this->mtx.get()}}; + // run the solve once so the internal vectors are initialized before + // attaching the logger + this->solver->apply(this->in, this->out->clone()); this->solver->add_logger(gko::log::SolverProgress::create_vector_storage( "solver_progress_test", false)); this->solver->add_logger(gko::log::SolverProgress::create_vector_storage( From c2511808ac7b8ef4fba149452b939bb6bfa9f114 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 15 Jul 2024 22:31:35 +0200 Subject: [PATCH 094/448] formatting --- common/cuda_hip/matrix/csr_kernels.template.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp index 
eda0e856b07..f17cf1548fe 100644 --- a/common/cuda_hip/matrix/csr_kernels.template.cpp +++ b/common/cuda_hip/matrix/csr_kernels.template.cpp @@ -84,6 +84,8 @@ using spgeam_kernels = #include "common/cuda_hip/matrix/csr_common.hpp.inc" + + namespace kernel { From e810e036fd578ef53be3fa4ebe7cb326867aa8ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Wed, 10 Jul 2024 17:52:20 +0200 Subject: [PATCH 095/448] Add additional tests for the communicator group --- cuda/test/components/cooperative_groups.cu | 37 ++++++++++++++++ .../test/components/cooperative_groups.dp.cpp | 42 +++++++++++++++++++ .../components/cooperative_groups.hip.cpp | 37 ++++++++++++++++ 3 files changed, 116 insertions(+) diff --git a/cuda/test/components/cooperative_groups.cu b/cuda/test/components/cooperative_groups.cu index df3cef86bb8..077b0121fbd 100644 --- a/cuda/test/components/cooperative_groups.cu +++ b/cuda/test/components/cooperative_groups.cu @@ -223,4 +223,41 @@ TEST_F(CooperativeGroups, SubwarpBallot) { test(cg_subwarp_ballot); } TEST_F(CooperativeGroups, SubwarpBallot2) { test_subwarp(cg_subwarp_ballot); } +__global__ void cg_communicator_categorization(bool*) +{ + auto this_block = group::this_thread_block(); + auto tiled_partition = + group::tiled_partition(this_block); + auto subwarp_partition = group::tiled_partition(this_block); + + using not_group = int; + using this_block_t = decltype(this_block); + using tiled_partition_t = decltype(tiled_partition); + using subwarp_partition_t = decltype(subwarp_partition); + + static_assert(!group::is_group::value && + group::is_group::value && + group::is_group::value && + group::is_group::value, + "Group check doesn't work."); + static_assert( + !group::is_synchronizable_group::value && + group::is_synchronizable_group::value && + group::is_synchronizable_group::value && + group::is_synchronizable_group::value, + "Synchronizable group check doesn't work."); + static_assert( + !group::is_communicator_group::value && 
+ !group::is_communicator_group::value && + group::is_communicator_group::value && + group::is_communicator_group::value, + "Communicator group check doesn't work."); +} + +TEST_F(CooperativeGroups, CorrectCategorization) +{ + test(cg_communicator_categorization); +} + + } // namespace diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp index 27e14b62d2d..8667a85713e 100644 --- a/dpcpp/test/components/cooperative_groups.dp.cpp +++ b/dpcpp/test/components/cooperative_groups.dp.cpp @@ -198,6 +198,48 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(cg_ballot_call, cg_ballot, default_config_list) TEST_P(CooperativeGroups, Ballot) { test_all_subgroup(cg_ballot_call); } +template +void cg_communicator_categorization(bool* s, sycl::nd_item<3> item_ct1) +{ + auto this_block = group::this_thread_block(item_ct1); + auto tiled_partition = + group::tiled_partition(this_block); + + using not_group = int; + using this_block_t = decltype(this_block); + using tiled_partition_t = decltype(tiled_partition); + + static_assert(!group::is_group::value && + group::is_group::value && + group::is_group::value, + "Group check doesn't work."); + static_assert( + !group::is_synchronizable_group::value && + group::is_synchronizable_group::value && + group::is_synchronizable_group::value, + "Synchronizable group check doesn't work."); + static_assert( + !group::is_communicator_group::value && + !group::is_communicator_group::value && + group::is_communicator_group::value, + "Communicator group check doesn't work."); + s[this_block.thread_rank()] = true; +} + +GKO_ENABLE_DEFAULT_HOST_CONFIG_TYPE(cg_communicator_categorization, + cg_communicator_categorization) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION_TOTYPE( + cg_communicator_categorization, cg_communicator_categorization, DCFG_1D) +GKO_ENABLE_DEFAULT_CONFIG_CALL(cg_communicator_categorization_call, + cg_communicator_categorization, + default_config_list) + +TEST_P(CooperativeGroups, 
CorrectCategorization) +{ + test_all_subgroup(cg_communicator_categorization_call); +} + + INSTANTIATE_TEST_SUITE_P(DifferentSubgroup, CooperativeGroups, testing::Values(4, 8, 16, 32, 64), testing::PrintToStringParamName()); diff --git a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp index 06a104a8879..0f71550139c 100644 --- a/hip/test/components/cooperative_groups.hip.cpp +++ b/hip/test/components/cooperative_groups.hip.cpp @@ -242,6 +242,43 @@ TEST_F(CooperativeGroups, SubwarpBallot) { test(cg_subwarp_ballot); } TEST_F(CooperativeGroups, SubwarpBallot2) { test_subwarp(cg_subwarp_ballot); } +__global__ void cg_communicator_categorization(bool*) +{ + auto this_block = group::this_thread_block(); + auto tiled_partition = + group::tiled_partition(this_block); + auto subwarp_partition = group::tiled_partition(this_block); + + using not_group = int; + using this_block_t = decltype(this_block); + using tiled_partition_t = decltype(tiled_partition); + using subwarp_partition_t = decltype(subwarp_partition); + + static_assert(!group::is_group::value && + group::is_group::value && + group::is_group::value && + group::is_group::value, + "Group check doesn't work."); + static_assert( + !group::is_synchronizable_group::value && + group::is_synchronizable_group::value && + group::is_synchronizable_group::value && + group::is_synchronizable_group::value, + "Synchronizable group check doesn't work."); + static_assert( + !group::is_communicator_group::value && + !group::is_communicator_group::value && + group::is_communicator_group::value && + group::is_communicator_group::value, + "Communicator group check doesn't work."); +} + +TEST_F(CooperativeGroups, CorrectCategorization) +{ + test(cg_communicator_categorization); +} + + template __global__ void cg_shuffle_sum(const int num, ValueType* __restrict__ value) { From 24461b0433de34333237fab5c56b63a79be8e32b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: 
Thu, 11 Jul 2024 14:13:54 +0200 Subject: [PATCH 096/448] Fix visibility of HIP specialization --- .../test/components/cooperative_groups.dp.cpp | 21 ++++++++++--------- hip/components/cooperative_groups.hip.hpp | 6 +++--- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp index 8667a85713e..eadd99a6ac5 100644 --- a/dpcpp/test/components/cooperative_groups.dp.cpp +++ b/dpcpp/test/components/cooperative_groups.dp.cpp @@ -213,17 +213,18 @@ void cg_communicator_categorization(bool* s, sycl::nd_item<3> item_ct1) group::is_group::value && group::is_group::value, "Group check doesn't work."); - static_assert( - !group::is_synchronizable_group::value && - group::is_synchronizable_group::value && - group::is_synchronizable_group::value, - "Synchronizable group check doesn't work."); - static_assert( - !group::is_communicator_group::value && - !group::is_communicator_group::value && - group::is_communicator_group::value, - "Communicator group check doesn't work."); + static_assert(!group::is_synchronizable_group::value && + group::is_synchronizable_group::value && + group::is_synchronizable_group::value, + "Synchronizable group check doesn't work."); + static_assert(!group::is_communicator_group::value && + !group::is_communicator_group::value && + group::is_communicator_group::value, + "Communicator group check doesn't work."); + // Make it work with the test framework, which performs 3 tests s[this_block.thread_rank()] = true; + s[this_block.thread_rank() + cfg::subgroup_size] = true; + s[this_block.thread_rank() + 2 * cfg::subgroup_size] = true; } GKO_ENABLE_DEFAULT_HOST_CONFIG_TYPE(cg_communicator_categorization, diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index d3dbc44a5c8..11581db0b0c 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -370,12 +370,12 
@@ namespace detail { template -struct is_group_impl> : std::true_type {}; +struct is_group_impl> : std::true_type {}; template -struct is_synchronizable_group_impl> : std::true_type { +struct is_synchronizable_group_impl> : std::true_type { }; template -struct is_communicator_group_impl> : std::true_type {}; +struct is_communicator_group_impl> : std::true_type {}; } // namespace detail From 53a81188071e2280231fa12fdbd4d1c4ea27d8a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Wed, 10 Jul 2024 17:54:53 +0200 Subject: [PATCH 097/448] Fix the communicator group categorization --- cuda/components/cooperative_groups.cuh | 2 +- dpcpp/components/cooperative_groups.dp.hpp | 2 +- hip/components/cooperative_groups.hip.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh index c4ceca9e409..983ec32f9ac 100644 --- a/cuda/components/cooperative_groups.cuh +++ b/cuda/components/cooperative_groups.cuh @@ -113,7 +113,7 @@ struct is_synchronizable_group_impl : std::false_type {}; template -struct is_communicator_group_impl : std::true_type {}; +struct is_communicator_group_impl : std::false_type {}; } // namespace detail diff --git a/dpcpp/components/cooperative_groups.dp.hpp b/dpcpp/components/cooperative_groups.dp.hpp index c758cf42710..33a107ef3f5 100644 --- a/dpcpp/components/cooperative_groups.dp.hpp +++ b/dpcpp/components/cooperative_groups.dp.hpp @@ -101,7 +101,7 @@ struct is_synchronizable_group_impl : std::false_type {}; template -struct is_communicator_group_impl : std::true_type {}; +struct is_communicator_group_impl : std::false_type {}; } // namespace detail diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index 11581db0b0c..2e5d7c0abff 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -101,7 +101,7 @@ struct is_synchronizable_group_impl 
: std::false_type {}; template -struct is_communicator_group_impl : std::true_type {}; +struct is_communicator_group_impl : std::false_type {}; } // namespace detail From 605889a539ab21b7b2a41ff465f06e63d3da2df9 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Thu, 11 Jul 2024 12:28:28 +0000 Subject: [PATCH 098/448] Format files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher --- cuda/test/components/cooperative_groups.cu | 11 +++++------ hip/components/cooperative_groups.hip.hpp | 7 ++++--- hip/test/components/cooperative_groups.hip.cpp | 11 +++++------ 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/cuda/test/components/cooperative_groups.cu b/cuda/test/components/cooperative_groups.cu index 077b0121fbd..0b384cd704e 100644 --- a/cuda/test/components/cooperative_groups.cu +++ b/cuda/test/components/cooperative_groups.cu @@ -246,12 +246,11 @@ __global__ void cg_communicator_categorization(bool*) group::is_synchronizable_group::value && group::is_synchronizable_group::value, "Synchronizable group check doesn't work."); - static_assert( - !group::is_communicator_group::value && - !group::is_communicator_group::value && - group::is_communicator_group::value && - group::is_communicator_group::value, - "Communicator group check doesn't work."); + static_assert(!group::is_communicator_group::value && + !group::is_communicator_group::value && + group::is_communicator_group::value && + group::is_communicator_group::value, + "Communicator group check doesn't work."); } TEST_F(CooperativeGroups, CorrectCategorization) diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index 2e5d7c0abff..36618bb7f3e 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -372,10 +372,11 @@ namespace detail { template struct is_group_impl> : std::true_type {}; template -struct is_synchronizable_group_impl> 
: std::true_type { -}; +struct is_synchronizable_group_impl> + : std::true_type {}; template -struct is_communicator_group_impl> : std::true_type {}; +struct is_communicator_group_impl> + : std::true_type {}; } // namespace detail diff --git a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp index 0f71550139c..bd8c79b9849 100644 --- a/hip/test/components/cooperative_groups.hip.cpp +++ b/hip/test/components/cooperative_groups.hip.cpp @@ -265,12 +265,11 @@ __global__ void cg_communicator_categorization(bool*) group::is_synchronizable_group::value && group::is_synchronizable_group::value, "Synchronizable group check doesn't work."); - static_assert( - !group::is_communicator_group::value && - !group::is_communicator_group::value && - group::is_communicator_group::value && - group::is_communicator_group::value, - "Communicator group check doesn't work."); + static_assert(!group::is_communicator_group::value && + !group::is_communicator_group::value && + group::is_communicator_group::value && + group::is_communicator_group::value, + "Communicator group check doesn't work."); } TEST_F(CooperativeGroups, CorrectCategorization) From ffbe84f3500cc39409a1b4398f85e3bd2b511866 Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Wed, 3 Jul 2024 11:46:14 +0000 Subject: [PATCH 099/448] Add row and col scaling functions to distributed matrix --- core/distributed/matrix.cpp | 50 ++++++++++++ include/ginkgo/core/distributed/matrix.hpp | 18 +++++ test/mpi/matrix.cpp | 93 ++++++++++++++++++++++ 3 files changed, 161 insertions(+) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 8eee020a3e6..0b9c06f761d 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "core/distributed/matrix_kernels.hpp" @@ -504,6 +505,55 @@ void Matrix::apply_impl( } +template +void Matrix::col_scale( + ptr_param scaling_factors) +{ + 
GKO_ASSERT_CONFORMANT(this, scaling_factors.get()); + auto exec = this->get_executor(); + auto comm = this->get_communicator(); + size_type n_local_cols = local_mtx_->get_size()[1]; + size_type n_non_local_cols = non_local_mtx_->get_size()[1]; + const auto scale_diag = gko::matrix::Diagonal::create_const( + exec, n_local_cols, + make_const_array_view(exec, n_local_cols, + scaling_factors->get_const_local_values())); + + auto req = this->communicate(scaling_factors->get_local_vector()); + scale_diag->rapply(local_mtx_, local_mtx_); + req.wait(); + if (n_non_local_cols > 0) { + auto use_host_buffer = mpi::requires_host_buffer(exec, comm); + if (use_host_buffer) { + recv_buffer_->copy_from(host_recv_buffer_.get()); + } + const auto non_local_scale_diag = + gko::matrix::Diagonal::create_const( + exec, n_non_local_cols, + make_const_array_view(exec, n_non_local_cols, + recv_buffer_->get_const_values())); + non_local_scale_diag->rapply(non_local_mtx_, non_local_mtx_); + } +} + + +template +void Matrix::row_scale( + ptr_param scaling_factors) +{ + GKO_ASSERT_EQUAL_ROWS(this, scaling_factors.get()); + auto exec = this->get_executor(); + size_type n_local_rows = local_mtx_->get_size()[0]; + const auto scale_diag = gko::matrix::Diagonal::create_const( + exec, n_local_rows, + make_const_array_view(exec, n_local_rows, + scaling_factors->get_const_local_values())); + + scale_diag->apply(local_mtx_, local_mtx_); + scale_diag->apply(non_local_mtx_, non_local_mtx_); +} + + template Matrix::Matrix(const Matrix& other) : EnableDistributedLinOp recv_offsets, array recv_gather_idxs); + /** + * Scales the columns of the matrix by the respective entries of the vector. + * The vector's row partition has to be the same as the matrix's column + * partition. The scaling is done in-place. + * + * @param scaling_factors The vector containing the scaling factors. + */ + void col_scale(ptr_param scaling_factors); + + /** + * Scales the rows of the matrix by the respective entries of the vector. 
+ * The vector and the matrix have to have the same row partition. + * The scaling is done in-place. + * + * @param scaling_factors The vector containing the scaling factors. + */ + void row_scale(ptr_param scaling_factors); + protected: explicit Matrix(std::shared_ptr exec, mpi::communicator comm); diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index cc9ec219a88..7af6f537fb3 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -516,6 +516,99 @@ TYPED_TEST(Matrix, CanAdvancedApplyToMultipleVectorsLarge) } +TYPED_TEST(Matrix, CanColScale) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::global_index_type; + using csr = typename TestFixture::local_matrix_type; + using dist_vec_type = typename TestFixture::dist_vec_type; + auto vec_md = gko::matrix_data{ + I>{{1}, {2}, {3}, {4}, {5}}}; + I> res_col_scale_local[] = { + {{8, 0}, {0, 0}}, {{0, 10}, {0, 0}}, {{0}}}; + I> res_col_scale_non_local[] = { + {{2, 0}, {6, 12}}, {{0, 0, 18}, {32, 35, 0}}, {{50, 9}}}; + auto rank = this->comm.rank(); + auto col_scaling_factors = dist_vec_type::create(this->exec, this->comm); + col_scaling_factors->read_distributed(vec_md, this->col_part); + + this->dist_mat->col_scale(col_scaling_factors); + + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_local_matrix()), + res_col_scale_local[rank], 0); + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_non_local_matrix()), + res_col_scale_non_local[rank], 0); +} + + +TYPED_TEST(Matrix, CanRowScale) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::global_index_type; + using csr = typename TestFixture::local_matrix_type; + using dist_vec_type = typename TestFixture::dist_vec_type; + auto vec_md = gko::matrix_data{ + I>{{1}, {2}, {3}, {4}, {5}}}; + I> res_row_scale_local[] = { + {{2, 0}, {0, 0}}, {{0, 15}, {0, 0}}, {{0}}}; + I> res_row_scale_non_local[] = { + {{1, 0}, {6, 8}}, {{0, 0, 18}, {32, 28, 0}}, {{50, 45}}}; + auto rank 
= this->comm.rank(); + auto row_scaling_factors = dist_vec_type::create(this->exec, this->comm); + row_scaling_factors->read_distributed(vec_md, this->row_part); + + this->dist_mat->row_scale(row_scaling_factors); + + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_local_matrix()), + res_row_scale_local[rank], 0); + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_non_local_matrix()), + res_row_scale_non_local[rank], 0); +} + + +TYPED_TEST(Matrix, ColScaleThrowsOnWrongDimension) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::global_index_type; + using dist_vec_type = typename TestFixture::dist_vec_type; + using part_type = typename TestFixture::part_type; + auto vec_md = gko::matrix_data{ + I>{{1}, {2}, {3}, {4}}}; + auto rank = this->comm.rank(); + auto col_part = part_type::build_from_mapping( + this->exec, + gko::array( + this->exec, + I{1, 2, 0, 0}), + 3); + auto col_scaling_factors = dist_vec_type::create(this->exec, this->comm); + col_scaling_factors->read_distributed(vec_md, col_part); + + ASSERT_THROW(this->dist_mat->col_scale(col_scaling_factors), + gko::DimensionMismatch); +} + + +TYPED_TEST(Matrix, RowScaleThrowsOnWrongDimension) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::global_index_type; + using dist_vec_type = typename TestFixture::dist_vec_type; + using part_type = typename TestFixture::part_type; + auto vec_md = gko::matrix_data{ + I>{{1}, {2}, {3}, {4}}}; + auto rank = this->comm.rank(); + auto row_part = part_type::build_from_contiguous( + this->exec, + gko::array(this->exec, I{0, 2, 3, 4})); + auto row_scaling_factors = dist_vec_type::create(this->exec, this->comm); + row_scaling_factors->read_distributed(vec_md, row_part); + + ASSERT_THROW(this->dist_mat->row_scale(row_scaling_factors), + gko::DimensionMismatch); +} + + TYPED_TEST(Matrix, CanConvertToNextPrecision) { using T = typename TestFixture::value_type; From 
25fa5be961a471354214355f85a773260511df38 Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Tue, 9 Jul 2024 08:13:47 +0000 Subject: [PATCH 100/448] Enable row / column scaling with strided vectors --- core/distributed/matrix.cpp | 31 +++++++++++++--- test/mpi/matrix.cpp | 71 +++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 5 deletions(-) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 0b9c06f761d..63f359cc40a 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -510,16 +510,27 @@ void Matrix::col_scale( ptr_param scaling_factors) { GKO_ASSERT_CONFORMANT(this, scaling_factors.get()); + GKO_ASSERT_EQ(scaling_factors->get_size()[1], 1); auto exec = this->get_executor(); auto comm = this->get_communicator(); size_type n_local_cols = local_mtx_->get_size()[1]; size_type n_non_local_cols = non_local_mtx_->get_size()[1]; + std::unique_ptr scaling_factors_single_stride; + auto stride = scaling_factors->get_stride(); + if (stride != 1) { + scaling_factors_single_stride = global_vector_type::create(exec, comm); + scaling_factors_single_stride->copy_from(scaling_factors.get()); + } + const auto scale_values = + stride == 1 ? scaling_factors->get_const_local_values() + : scaling_factors_single_stride->get_const_local_values(); const auto scale_diag = gko::matrix::Diagonal::create_const( exec, n_local_cols, - make_const_array_view(exec, n_local_cols, - scaling_factors->get_const_local_values())); + make_const_array_view(exec, n_local_cols, scale_values)); - auto req = this->communicate(scaling_factors->get_local_vector()); + auto req = this->communicate( + stride == 1 ? 
scaling_factors->get_local_vector() + : scaling_factors_single_stride->get_local_vector()); scale_diag->rapply(local_mtx_, local_mtx_); req.wait(); if (n_non_local_cols > 0) { @@ -542,12 +553,22 @@ void Matrix::row_scale( ptr_param scaling_factors) { GKO_ASSERT_EQUAL_ROWS(this, scaling_factors.get()); + GKO_ASSERT_EQ(scaling_factors->get_size()[1], 1); auto exec = this->get_executor(); + auto comm = this->get_communicator(); size_type n_local_rows = local_mtx_->get_size()[0]; + std::unique_ptr scaling_factors_single_stride; + auto stride = scaling_factors->get_stride(); + if (stride != 1) { + scaling_factors_single_stride = global_vector_type::create(exec, comm); + scaling_factors_single_stride->copy_from(scaling_factors.get()); + } + const auto scale_values = + stride == 1 ? scaling_factors->get_const_local_values() + : scaling_factors_single_stride->get_const_local_values(); const auto scale_diag = gko::matrix::Diagonal::create_const( exec, n_local_rows, - make_const_array_view(exec, n_local_rows, - scaling_factors->get_const_local_values())); + make_const_array_view(exec, n_local_rows, scale_values)); scale_diag->apply(local_mtx_, local_mtx_); scale_diag->apply(non_local_mtx_, non_local_mtx_); diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 7af6f537fb3..454197ccfd9 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -20,6 +20,7 @@ #include #include "core/test/utils.hpp" +#include "ginkgo/core/base/exception.hpp" #include "test/utils/mpi/common_fixture.hpp" @@ -566,6 +567,62 @@ TYPED_TEST(Matrix, CanRowScale) } +TYPED_TEST(Matrix, CanColScaleWithStride) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::global_index_type; + using csr = typename TestFixture::local_matrix_type; + using dist_vec_type = typename TestFixture::dist_vec_type; + auto vec_md = gko::matrix_data{ + I>{{1}, {2}, {3}, {4}, {5}}}; + I> res_col_scale_local[] = { + {{8, 0}, {0, 0}}, {{0, 10}, {0, 0}}, {{0}}}; + I> 
res_col_scale_non_local[] = { + {{2, 0}, {6, 12}}, {{0, 0, 18}, {32, 35, 0}}, {{50, 9}}}; + gko::dim<2> local_sizes[] = {{2, 1}, {2, 1}, {1, 1}}; + auto rank = this->comm.rank(); + auto col_scaling_factors = dist_vec_type::create( + this->exec, this->comm, gko::dim<2>{5, 1}, local_sizes[rank], 2); + col_scaling_factors->read_distributed(vec_md, this->col_part); + + this->dist_mat->col_scale(col_scaling_factors); + + GKO_ASSERT_EQ(col_scaling_factors->get_stride(), 2); + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_local_matrix()), + res_col_scale_local[rank], 0); + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_non_local_matrix()), + res_col_scale_non_local[rank], 0); +} + + +TYPED_TEST(Matrix, CanRowScaleWithStride) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::global_index_type; + using csr = typename TestFixture::local_matrix_type; + using dist_vec_type = typename TestFixture::dist_vec_type; + auto vec_md = gko::matrix_data{ + I>{{1}, {2}, {3}, {4}, {5}}}; + I> res_row_scale_local[] = { + {{2, 0}, {0, 0}}, {{0, 15}, {0, 0}}, {{0}}}; + I> res_row_scale_non_local[] = { + {{1, 0}, {6, 8}}, {{0, 0, 18}, {32, 28, 0}}, {{50, 45}}}; + gko::dim<2> local_sizes[] = {{2, 1}, {2, 1}, {1, 1}}; + auto rank = this->comm.rank(); + auto row_scaling_factors = dist_vec_type::create( + this->exec, this->comm, gko::dim<2>{5, 1}, local_sizes[rank], 2); + row_scaling_factors->read_distributed(vec_md, this->row_part); + + this->dist_mat->row_scale(row_scaling_factors); + + GKO_ASSERT_EQ(row_scaling_factors->get_stride(), 2); + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_local_matrix()), + res_row_scale_local[rank], 0); + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_non_local_matrix()), + res_row_scale_non_local[rank], 0); +} + + TYPED_TEST(Matrix, ColScaleThrowsOnWrongDimension) { using value_type = typename TestFixture::value_type; @@ -574,6 +631,8 @@ TYPED_TEST(Matrix, ColScaleThrowsOnWrongDimension) using part_type = 
typename TestFixture::part_type; auto vec_md = gko::matrix_data{ I>{{1}, {2}, {3}, {4}}}; + auto two_vec_md = gko::matrix_data{ + I>{{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}}}; auto rank = this->comm.rank(); auto col_part = part_type::build_from_mapping( this->exec, @@ -583,9 +642,14 @@ TYPED_TEST(Matrix, ColScaleThrowsOnWrongDimension) 3); auto col_scaling_factors = dist_vec_type::create(this->exec, this->comm); col_scaling_factors->read_distributed(vec_md, col_part); + auto two_col_scaling_factors = + dist_vec_type::create(this->exec, this->comm); + two_col_scaling_factors->read_distributed(two_vec_md, this->col_part); ASSERT_THROW(this->dist_mat->col_scale(col_scaling_factors), gko::DimensionMismatch); + ASSERT_THROW(this->dist_mat->col_scale(two_col_scaling_factors), + gko::ValueMismatch); } @@ -597,15 +661,22 @@ TYPED_TEST(Matrix, RowScaleThrowsOnWrongDimension) using part_type = typename TestFixture::part_type; auto vec_md = gko::matrix_data{ I>{{1}, {2}, {3}, {4}}}; + auto two_vec_md = gko::matrix_data{ + I>{{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}}}; auto rank = this->comm.rank(); auto row_part = part_type::build_from_contiguous( this->exec, gko::array(this->exec, I{0, 2, 3, 4})); auto row_scaling_factors = dist_vec_type::create(this->exec, this->comm); row_scaling_factors->read_distributed(vec_md, row_part); + auto two_row_scaling_factors = + dist_vec_type::create(this->exec, this->comm); + two_row_scaling_factors->read_distributed(two_vec_md, this->col_part); ASSERT_THROW(this->dist_mat->row_scale(row_scaling_factors), gko::DimensionMismatch); + ASSERT_THROW(this->dist_mat->row_scale(two_row_scaling_factors), + gko::ValueMismatch); } From f91b4271de9138da0f7fc469eef1a86af5cc35c3 Mon Sep 17 00:00:00 2001 From: fritzgoebel Date: Thu, 11 Jul 2024 15:17:34 +0200 Subject: [PATCH 101/448] Apply suggestions from code review Co-authored-by: Tobias Ribizel --- test/mpi/matrix.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/mpi/matrix.cpp 
b/test/mpi/matrix.cpp index 454197ccfd9..1c090b6c43f 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -587,7 +587,7 @@ TYPED_TEST(Matrix, CanColScaleWithStride) this->dist_mat->col_scale(col_scaling_factors); - GKO_ASSERT_EQ(col_scaling_factors->get_stride(), 2); + ASSERT_EQ(col_scaling_factors->get_stride(), 2); GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_local_matrix()), res_col_scale_local[rank], 0); GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_non_local_matrix()), @@ -615,7 +615,7 @@ TYPED_TEST(Matrix, CanRowScaleWithStride) this->dist_mat->row_scale(row_scaling_factors); - GKO_ASSERT_EQ(row_scaling_factors->get_stride(), 2); + ASSERT_EQ(row_scaling_factors->get_stride(), 2); GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_local_matrix()), res_row_scale_local[rank], 0); GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_non_local_matrix()), From 7daba3684a5baad3a3f2603487246f8443637d90 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 30 Jul 2024 09:53:38 +0200 Subject: [PATCH 102/448] fix stride for GCR initialization --- common/unified/solver/gcr_kernels.cpp | 5 ++--- test/solver/gcr_kernels.cpp | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/common/unified/solver/gcr_kernels.cpp b/common/unified/solver/gcr_kernels.cpp index 0c9e825228a..7adef77dfb1 100644 --- a/common/unified/solver/gcr_kernels.cpp +++ b/common/unified/solver/gcr_kernels.cpp @@ -27,7 +27,7 @@ void initialize(std::shared_ptr exec, stopping_status* stop_status) { if (b->get_size()) { - run_kernel_solver( + run_kernel( exec, [] GKO_KERNEL(auto row, auto col, auto b, auto residual, auto stop) { @@ -36,8 +36,7 @@ void initialize(std::shared_ptr exec, } residual(row, col) = b(row, col); }, - b->get_size(), b->get_stride(), default_stride(b), - default_stride(residual), stop_status); + b->get_size(), b, residual, stop_status); } else { run_kernel( exec, [] GKO_KERNEL(auto col, auto stop) { stop[col].reset(); }, diff --git 
a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp index 5a46bbbb940..eb3f5c6df93 100644 --- a/test/solver/gcr_kernels.cpp +++ b/test/solver/gcr_kernels.cpp @@ -157,6 +157,22 @@ TEST_F(Gcr, GcrKernelInitializeIsEquivalentToRef) } +TEST_F(Gcr, GcrKernelInitializeWithStrideIsEquivalentToRef) +{ + initialize_data(); + auto d_b_strided = Mtx::create(exec, b->get_size(), b->get_stride() + 2); + d_b_strided->copy_from(d_b); + + gko::kernels::reference::gcr::initialize(ref, b.get(), residual.get(), + stop_status.get_data()); + gko::kernels::GKO_DEVICE_NAMESPACE::gcr::initialize( + exec, d_b_strided.get(), d_residual.get(), d_stop_status.get_data()); + + GKO_ASSERT_MTX_NEAR(d_residual, residual, r::value); + GKO_ASSERT_ARRAY_EQ(d_stop_status, stop_status); +} + + TEST_F(Gcr, GcrKernelRestartIsEquivalentToRef) { initialize_data(); From c589751872cd906457250fb1d208babf7057d694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <1289205+lahwaacz@users.noreply.github.com> Date: Tue, 6 Aug 2024 18:57:04 +0200 Subject: [PATCH 103/448] Include missing iomanip header in solver_progress.cpp test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this, compiling with g++ 14.2.1 fails with the following error: ``` /build/ginkgo-hpc-git/src/ginkgo/core/test/log/solver_progress.cpp: In member function ‘void SolverProgress_TableWorks_Test::TestBody()’: /build/ginkgo-hpc-git/src/ginkgo/core/test/log/solver_progress.cpp:82:20: error: ‘setw’ is not a member of ‘std’ 82 | ref_ss << std::setw(default_column_width) << "Iteration" | ^~~~ /build/ginkgo-hpc-git/src/ginkgo/core/test/log/solver_progress.cpp:15:1: note: ‘std::setw’ is defined in header ‘’; this is probably fixable by adding ‘#include ’ 14 | #include "core/test/utils.hpp" +++ |+#include 15 | #include "core/test/utils/assertions.hpp" /build/ginkgo-hpc-git/src/ginkgo/core/test/log/solver_progress.cpp:83:20: error: ‘setw’ is not a member of ‘std’ 83 | << 
std::setw(default_column_width) << "beta" | ^~~~ /build/ginkgo-hpc-git/src/ginkgo/core/test/log/solver_progress.cpp:83:20: note: ‘std::setw’ is defined in header ‘’; this is probably fixable by adding ‘#include ’ /build/ginkgo-hpc-git/src/ginkgo/core/test/log/solver_progress.cpp:84:20: error: ‘setw’ is not a member of ‘std’ 84 | << std::setw(default_column_width) << "prev_rho" | ^~~~ ... ``` --- core/test/log/solver_progress.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/core/test/log/solver_progress.cpp b/core/test/log/solver_progress.cpp index fe8a4537f66..e00044a908d 100644 --- a/core/test/log/solver_progress.cpp +++ b/core/test/log/solver_progress.cpp @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include #include #include From f3e68711e6d8cbd1e9985e79a9978ef0f56cdaea Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 29 Apr 2024 11:49:17 +0200 Subject: [PATCH 104/448] add simplified segmented range feature --- core/base/segmented_range.hpp | 348 +++++++++++++++++++++++++++++ core/test/base/CMakeLists.txt | 1 + core/test/base/segmented_range.cpp | 84 +++++++ 3 files changed, 433 insertions(+) create mode 100644 core/base/segmented_range.hpp create mode 100644 core/test/base/segmented_range.cpp diff --git a/core/base/segmented_range.hpp b/core/base/segmented_range.hpp new file mode 100644 index 00000000000..afe04b60b69 --- /dev/null +++ b/core/base/segmented_range.hpp @@ -0,0 +1,348 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CORE_BASE_SEGMENTED_RANGE_HPP_ +#define GKO_CORE_BASE_SEGMENTED_RANGE_HPP_ + + +#include +#include + + +#include "core/base/index_range.hpp" +#include "core/base/iterator_factory.hpp" + + +namespace gko { + + +/** + * Represents a range of indices that is segmented into contiguous segments. + * Each segment has the shape `[begin, end)`, i.e. it is a half-open interval. 
+ * + * @tparam IndexType the type of indices used to represent the segments. + */ +template +class segmented_range { +public: + using index_type = IndexType; + using index_iterator = index_iterator; + using segment = irange; + + /** + * An iterator pointing to (or past) a single segment in the range. + */ + class iterator { + public: + constexpr explicit iterator(segmented_range range, index_type segment) + : range_{range}, segment_{segment} + {} + + constexpr segment operator*() const + { + assert(segment_ >= 0); + assert(segment_ < range_.num_segments()); + return segment{range_.begin_index(segment_), + range_.end_index(segment_)}; + } + + constexpr iterator& operator++() + { + ++segment_; + return *this; + } + + constexpr friend bool operator==(iterator lhs, iterator rhs) + { + assert(lhs.range_ == rhs.range_); + return lhs.segment_ == rhs.segment_; + } + + constexpr friend bool operator!=(iterator lhs, iterator rhs) + { + return !(lhs == rhs); + } + + private: + segmented_range range_; + index_type segment_; + }; + + /** + * Constructs a segmented range from separate begin and end pointers. + * The `i`th range is given by `[begins[i], ends[i])`. + * + * @param begins a pointer to the array of beginning indices + * @param ends a pointer to the array of end indices + * @param num_segments the number of segments, i.e. the size of the + * beginning and end index arrays. + */ + constexpr explicit segmented_range(const index_type* begins, + const index_type* ends, + index_type num_segments) + : begins_{begins}, ends_{ends}, num_segments_{num_segments} + { + assert(num_segments_ >= 0); + } + + /** + * Constructs a segmented range from combined begin and end pointers. + * The `i`th range is given by `[ptrs[i], ptrs[i + 1])`. + * + * @param ptrs a pointer to the array of beginning and end indices + * @param num_segments the number of segments, i.e. the size of the + * ptrs index arrays. 
+ */ + constexpr explicit segmented_range(const index_type* ptrs, + index_type num_segments) + : segmented_range{ptrs, ptrs + 1, num_segments} + {} + + /** + * Returns the segment at a given index. + * + * @param segment the index to access. It must be in `[0, num_segments())`. + * @return the segment at this index. + */ + constexpr segment operator[](index_type segment) const + { + assert(segment >= 0); + assert(segment < num_segments()); + return *iterator{*this, segment}; + } + + /** @return the number of segments in this range. */ + constexpr index_type num_segments() const { return num_segments_; } + + /** @return iterator pointing to the first segment. */ + constexpr iterator begin() const { return iterator{*this, 0}; } + + /** @return iterator pointing one past the last segment. */ + constexpr iterator end() const { return iterator{*this, num_segments()}; } + + /** @return pointer to the array of segment begin indices. */ + constexpr const index_type* begin_indices() const { return begins_; } + + /** @return pointer to the array of segment end indices. */ + constexpr const index_type* end_indices() const { return ends_; } + + /** @return the beginning index of the given segment. */ + constexpr index_type begin_index(index_type segment) const + { + assert(segment >= 0); + assert(segment < num_segments()); + return begin_indices()[segment]; + } + + /** @return the end index of the given segment. */ + constexpr index_type end_index(index_type segment) const + { + assert(segment >= 0); + assert(segment < num_segments()); + return end_indices()[segment]; + } + + /** Compares two ranges for equality. */ + constexpr friend bool operator==(segmented_range lhs, segmented_range rhs) + { + return lhs.begin_indices() == rhs.begin_indices() && + lhs.end_indices() == rhs.end_indices() && + lhs.num_segments() == rhs.num_segments(); + } + + /** Compares two ranges for inequality.
*/ + constexpr friend bool operator!=(segmented_range lhs, segmented_range rhs) + { + return !(lhs == rhs); + } + +private: + const index_type* begins_; + const index_type* ends_; + index_type num_segments_; +}; + + +/** + * Represents a range of indices that is segmented into contiguous segments, + * mapped into a value array. Each segment has the shape `[begin, end)`, i.e. it + * is a half-open interval and points to corresponding entries of the value + * array. + * + * @tparam IndexType the type of indices used to represent the segments. + * @tparam ValueIterator the iterator type pointing to the values. + */ +template +class segmented_value_range { +public: + using index_type = IndexType; + using index_iterator = index_iterator; + using value_iterator = ValueIterator; + using segment = iterator_range; + using enumerated_range = segmented_value_range< + index_type, detail::zip_iterator>; + + /** + * An iterator pointing to (or past) a single segment in the range. + */ + class iterator { + public: + constexpr explicit iterator(segmented_value_range range, + index_type segment) + : range_{range}, segment_{segment} + {} + + constexpr segment operator*() const + { + assert(segment_ >= 0); + assert(segment_ < range_.num_segments()); + return segment{range_.values() + range_.begin_index(segment_), + range_.values() + range_.end_index(segment_)}; + } + + constexpr iterator& operator++() + { + ++segment_; + return *this; + } + + constexpr friend bool operator==(iterator lhs, iterator rhs) + { + assert(lhs.range_ == rhs.range_); + return lhs.segment_ == rhs.segment_; + } + + constexpr friend bool operator!=(iterator lhs, iterator rhs) + { + return !(lhs == rhs); + } + + private: + segmented_value_range range_; + index_type segment_; + }; + + /** + * Constructs a segmented values range from separate begin and end pointers. + * The `i`th range is given by `[begins[i], ends[i])`. 
+ * + * @param begins a pointer to the array of beginning indices + * @param ends a pointer to the array of end indices + * @param values an iterator pointing to the values into which the + * beginning/end indices point. + * @param num_segments the number of segments, i.e. the size of the + * beginning and end index arrays. + */ + constexpr explicit segmented_value_range(const index_type* begins, + const index_type* ends, + value_iterator values, + index_type num_segments) + : begins_{begins}, + ends_{ends}, + values_{values}, + num_segments_{num_segments} + { + assert(num_segments_ >= 0); + } + + /** + * Constructs a segmented range from combined begin and end pointers. + * The `i`th range is given by `[ptrs[i], ptrs[i + 1])`. + * + * @param ptrs a pointer to the array of beginning and end indices + * @param values an iterator pointing to the values into which the + * beginning/end indices point. + * @param num_segments the number of segments, i.e. the size of the + * ptrs index arrays. + */ + constexpr explicit segmented_value_range(const index_type* ptrs, + value_iterator values, + index_type num_segments) + : segmented_value_range{ptrs, ptrs + 1, values, num_segments} + {} + + /** + * Returns the segment at a given index. + * + * @param segment the index to access. It must be in `[0, num_segments())`. + * @return the segment at this index. + */ + constexpr segment operator[](index_type segment) const + { + assert(segment >= 0); + assert(segment < num_segments()); + return *iterator{*this, segment}; + } + + /** @return the number of segments in this range. */ + constexpr index_type num_segments() const { return num_segments_; } + + constexpr enumerated_range enumerated() const + { + return enumerated_range{ + begin_indices(), end_indices(), + detail::make_zip_iterator(index_iterator{0}, values()), + num_segments()}; + } + + /** @return iterator pointing to the first segment. 
*/ + constexpr iterator begin() const { return iterator{*this, 0}; } + + /** @return iterator pointing one past the last segment. */ + constexpr iterator end() const { return iterator{*this, num_segments()}; } + + /** @return pointer to the array of segment begin indices. */ + constexpr const index_type* begin_indices() const { return begins_; } + + /** @return pointer to the array of segment end indices. */ + constexpr const index_type* end_indices() const { return ends_; } + + /** @return the beginning index of the given segment. */ + constexpr index_type begin_index(index_type segment) const + { + assert(segment >= 0); + assert(segment < num_segments()); + return begin_indices()[segment]; + } + + /** @return the end index of the given segment. */ + constexpr index_type end_index(index_type segment) const + { + assert(segment >= 0); + assert(segment < num_segments()); + return end_indices()[segment]; + } + + /** @return the value iterator. */ + constexpr value_iterator values() const { return values_; } + + /** Compares two ranges for equality. */ + constexpr friend bool operator==(segmented_value_range lhs, + segmented_value_range rhs) + { + return lhs.begin_indices() == rhs.begin_indices() && + lhs.end_indices() == rhs.end_indices() && + lhs.values() == rhs.values() && + lhs.num_segments() == rhs.num_segments(); + } + + /** Compares two ranges for inequality.
*/ + constexpr friend bool operator!=(segmented_value_range lhs, + segmented_value_range rhs) + { + return !(lhs == rhs); + } + +private: + const index_type* begins_; + const index_type* ends_; + value_iterator values_; + index_type num_segments_; +}; + + +} // namespace gko + + +#endif // GKO_CORE_BASE_SEGMENTED_RANGE_HPP_ diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt index c608acd9a8a..d7deeec6fb7 100644 --- a/core/test/base/CMakeLists.txt +++ b/core/test/base/CMakeLists.txt @@ -27,6 +27,7 @@ ginkgo_create_test(range) ginkgo_create_test(range_accessors) ginkgo_create_test(sanitizers ADDITIONAL_LIBRARIES Threads::Threads) ginkgo_create_test(segmented_array) +ginkgo_create_test(segmented_range) ginkgo_create_test(types) ginkgo_create_test(utils) ginkgo_create_test(version EXECUTABLE_NAME version_test) # version collides with C++ stdlib header diff --git a/core/test/base/segmented_range.cpp b/core/test/base/segmented_range.cpp new file mode 100644 index 00000000000..33c2941d4dd --- /dev/null +++ b/core/test/base/segmented_range.cpp @@ -0,0 +1,84 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include +#include + + +#include + + +#include "core/base/segmented_range.hpp" + + +TEST(SegmentedRange, Works) +{ + std::vector begins{3, 1, 4, 9}; + std::vector ends{3, 10, 6, 10}; + std::vector> result_indices(begins.size()); + gko::segmented_range range{begins.data(), ends.data(), + static_cast(begins.size())}; + + for (auto row : gko::irange(begins.size())) { + for (auto nz : range[row]) { + result_indices[row].push_back(nz); + } + } + + ASSERT_EQ(result_indices, + std::vector>( + {{}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {4, 5}, {9}})); +} + + +TEST(SegmentedValueRange, Works) +{ + std::vector begins{3, 1, 4, 9}; + std::vector ends{3, 10, 6, 10}; + std::vector values(ends.back()); + std::iota(values.begin(), values.end(), 1); + std::vector> result_values(begins.size()); + 
gko::segmented_value_range::iterator> range{ + begins.data(), ends.data(), values.begin(), + static_cast(begins.size())}; + + for (auto row : gko::irange(begins.size())) { + for (auto nz : range[row]) { + result_values[row].push_back(nz); + } + } + + ASSERT_EQ(result_values, + std::vector>( + {{}, {2, 3, 4, 5, 6, 7, 8, 9, 10}, {5, 6}, {10}})); +} + + +TEST(SegmentedEnumeratedValueRange, Works) +{ + std::vector begins{3, 1, 4, 9}; + std::vector ends{3, 10, 6, 10}; + std::vector values(ends.back()); + std::iota(values.begin(), values.end(), 1); + std::vector> result_values(begins.size()); + std::vector> result_indices(begins.size()); + gko::segmented_value_range::iterator> range{ + begins.data(), ends.data(), values.begin(), + static_cast(begins.size())}; + + for (auto row : gko::irange(begins.size())) { + for (auto tuple : range.enumerated()[row]) { + result_indices[row].push_back(std::get<0>(tuple)); + result_values[row].push_back(std::get<1>(tuple)); + } + } + + ASSERT_EQ(result_indices, + std::vector>( + {{}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {4, 5}, {9}})); + ASSERT_EQ(result_values, + std::vector>( + {{}, {2, 3, 4, 5, 6, 7, 8, 9, 10}, {5, 6}, {10}})); +} From 11eded47944d476256157e7b6fb5fdfbb668e6fb Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 29 Apr 2024 11:49:31 +0200 Subject: [PATCH 105/448] add structured binding support --- core/base/segmented_range.hpp | 29 ++++++++++---- core/test/base/segmented_range.cpp | 61 ++++++++++++++++++++++++++++-- 2 files changed, 78 insertions(+), 12 deletions(-) diff --git a/core/base/segmented_range.hpp b/core/base/segmented_range.hpp index afe04b60b69..344de0aa623 100644 --- a/core/base/segmented_range.hpp +++ b/core/base/segmented_range.hpp @@ -39,12 +39,18 @@ class segmented_range { : range_{range}, segment_{segment} {} - constexpr segment operator*() const + struct enumerated_segment { + index_type index; + segment segment; + }; + + constexpr enumerated_segment operator*() const { assert(segment_ >= 0); 
assert(segment_ < range_.num_segments()); - return segment{range_.begin_index(segment_), - range_.end_index(segment_)}; + return enumerated_segment{segment_, + segment{range_.begin_index(segment_), + range_.end_index(segment_)}}; } constexpr iterator& operator++() @@ -109,7 +115,7 @@ class segmented_range { { assert(segment >= 0); assert(segment < num_segments()); - return *iterator{*this, segment}; + return (*iterator{*this, segment}).segment; } /** @return the number of segments in this range. */ @@ -193,12 +199,19 @@ class segmented_value_range { : range_{range}, segment_{segment} {} - constexpr segment operator*() const + struct enumerated_segment { + index_type index; + segment segment; + }; + + constexpr enumerated_segment operator*() const { assert(segment_ >= 0); assert(segment_ < range_.num_segments()); - return segment{range_.values() + range_.begin_index(segment_), - range_.values() + range_.end_index(segment_)}; + return enumerated_segment{ + segment_, + segment{range_.values() + range_.begin_index(segment_), + range_.values() + range_.end_index(segment_)}}; } constexpr iterator& operator++() @@ -272,7 +285,7 @@ class segmented_value_range { { assert(segment >= 0); assert(segment < num_segments()); - return *iterator{*this, segment}; + return (*iterator{*this, segment}).segment; } /** @return the number of segments in this range. 
*/ diff --git a/core/test/base/segmented_range.cpp b/core/test/base/segmented_range.cpp index 33c2941d4dd..5de04c3035a 100644 --- a/core/test/base/segmented_range.cpp +++ b/core/test/base/segmented_range.cpp @@ -33,7 +33,7 @@ TEST(SegmentedRange, Works) } -TEST(SegmentedValueRange, Works) +TEST(SegmentedValueRange, WorksByIndex) { std::vector begins{3, 1, 4, 9}; std::vector ends{3, 10, 6, 10}; @@ -56,8 +56,32 @@ TEST(SegmentedValueRange, Works) } -TEST(SegmentedEnumeratedValueRange, Works) +TEST(SegmentedValueRange, WorksByRangeFor) { + std::vector begins{3, 1, 4, 9}; + std::vector ends{3, 10, 6, 10}; + std::vector values(ends.back()); + std::iota(values.begin(), values.end(), 1); + std::vector> result_values(begins.size()); + gko::segmented_value_range::iterator> range{ + begins.data(), ends.data(), values.begin(), + static_cast(begins.size())}; + + for (auto [row, segment] : range) { + for (auto nz : segment) { + result_values[row].push_back(nz); + } + } + + ASSERT_EQ(result_values, + std::vector>( + {{}, {2, 3, 4, 5, 6, 7, 8, 9, 10}, {5, 6}, {10}})); +} + + +TEST(SegmentedEnumeratedValueRange, WorksByIndex) +{ + using gko::get; std::vector begins{3, 1, 4, 9}; std::vector ends{3, 10, 6, 10}; std::vector values(ends.back()); @@ -70,8 +94,37 @@ TEST(SegmentedEnumeratedValueRange, Works) for (auto row : gko::irange(begins.size())) { for (auto tuple : range.enumerated()[row]) { - result_indices[row].push_back(std::get<0>(tuple)); - result_values[row].push_back(std::get<1>(tuple)); + result_indices[row].push_back(get<0>(tuple)); + result_values[row].push_back(get<1>(tuple)); + } + } + + ASSERT_EQ(result_indices, + std::vector>( + {{}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {4, 5}, {9}})); + ASSERT_EQ(result_values, + std::vector>( + {{}, {2, 3, 4, 5, 6, 7, 8, 9, 10}, {5, 6}, {10}})); +} + + +TEST(SegmentedEnumeratedValueRange, WorksByRangeFor) +{ + std::vector begins{3, 1, 4, 9}; + std::vector ends{3, 10, 6, 10}; + std::vector values(ends.back()); + std::iota(values.begin(), 
values.end(), 1); + std::vector> result_values(begins.size()); + std::vector> result_indices(begins.size()); + gko::segmented_value_range::iterator> range{ + begins.data(), ends.data(), values.begin(), + static_cast(begins.size())}; + auto enumerated_range = range.enumerated(); + + for (auto [row, segment] : enumerated_range) { + for (auto [index, value] : segment) { + result_indices[row].push_back(index); + result_values[row].push_back(value); } } From 4b1be60faccb68c78f2cf226414486a7bdeb2f8a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 1 May 2024 13:28:36 +0200 Subject: [PATCH 106/448] review updates - fix name hiding in classes - add tests for ptrs constructors Co-authored-by: Marcel Koch --- core/base/segmented_range.hpp | 38 ++++++++++++------ core/test/base/segmented_range.cpp | 64 ++++++++++++++++++++++++++++-- 2 files changed, 86 insertions(+), 16 deletions(-) diff --git a/core/base/segmented_range.hpp b/core/base/segmented_range.hpp index 344de0aa623..3820dbfb06f 100644 --- a/core/base/segmented_range.hpp +++ b/core/base/segmented_range.hpp @@ -27,8 +27,8 @@ template class segmented_range { public: using index_type = IndexType; - using index_iterator = index_iterator; - using segment = irange; + using index_iterator_type = index_iterator; + using segment_type = irange; /** * An iterator pointing to (or past) a single segment in the range. @@ -41,7 +41,7 @@ class segmented_range { struct enumerated_segment { index_type index; - segment segment; + segment_type segment; }; constexpr enumerated_segment operator*() const @@ -49,8 +49,8 @@ class segmented_range { assert(segment_ >= 0); assert(segment_ < range_.num_segments()); return enumerated_segment{segment_, - segment{range_.begin_index(segment_), - range_.end_index(segment_)}}; + segment_type{range_.begin_index(segment_), + range_.end_index(segment_)}}; } constexpr iterator& operator++() @@ -111,7 +111,7 @@ class segmented_range { * @param segment the index to access. 
It must be in `[0, num_segments())`. * @return the segment at this index. */ - constexpr segment operator[](index_type segment) const + constexpr segment_type operator[](index_type segment) const { assert(segment >= 0); assert(segment < num_segments()); @@ -121,6 +121,12 @@ class segmented_range { /** @return the number of segments in this range. */ constexpr index_type num_segments() const { return num_segments_; } + /** @return an index range representing all segment indices. */ + constexpr irange segment_indices() const + { + return irange{num_segments()}; + } + /** @return iterator pointing to the first segment. */ constexpr iterator begin() const { return iterator{*this, 0}; } @@ -183,11 +189,11 @@ template class segmented_value_range { public: using index_type = IndexType; - using index_iterator = index_iterator; + using index_iterator_type = index_iterator; using value_iterator = ValueIterator; - using segment = iterator_range; + using segment_type = iterator_range; using enumerated_range = segmented_value_range< - index_type, detail::zip_iterator>; + index_type, detail::zip_iterator>; /** * An iterator pointing to (or past) a single segment in the range. @@ -201,7 +207,7 @@ class segmented_value_range { struct enumerated_segment { index_type index; - segment segment; + segment_type segment; }; constexpr enumerated_segment operator*() const @@ -210,8 +216,8 @@ class segmented_value_range { assert(segment_ < range_.num_segments()); return enumerated_segment{ segment_, - segment{range_.values() + range_.begin_index(segment_), - range_.values() + range_.end_index(segment_)}}; + segment_type{range_.values() + range_.begin_index(segment_), + range_.values() + range_.end_index(segment_)}}; } constexpr iterator& operator++() @@ -281,7 +287,7 @@ class segmented_value_range { * @param segment the index to access. It must be in `[0, num_segments())`. * @return the segment at this index. 
*/ - constexpr segment operator[](index_type segment) const + constexpr segment_type operator[](index_type segment) const { assert(segment >= 0); assert(segment < num_segments()); @@ -291,6 +297,12 @@ class segmented_value_range { /** @return the number of segments in this range. */ constexpr index_type num_segments() const { return num_segments_; } + /** @return an index range representing all segment indices. */ + constexpr irange segment_indices() const + { + return irange{num_segments()}; + } + constexpr enumerated_range enumerated() const { return enumerated_range{ diff --git a/core/test/base/segmented_range.cpp b/core/test/base/segmented_range.cpp index 5de04c3035a..76f5ae8ffcc 100644 --- a/core/test/base/segmented_range.cpp +++ b/core/test/base/segmented_range.cpp @@ -13,7 +13,7 @@ #include "core/base/segmented_range.hpp" -TEST(SegmentedRange, Works) +TEST(SegmentedRange, WorksByIndex) { std::vector begins{3, 1, 4, 9}; std::vector ends{3, 10, 6, 10}; @@ -21,7 +21,7 @@ TEST(SegmentedRange, Works) gko::segmented_range range{begins.data(), ends.data(), static_cast(begins.size())}; - for (auto row : gko::irange(begins.size())) { + for (auto row : range.segment_indices()) { for (auto nz : range[row]) { result_indices[row].push_back(nz); } @@ -33,6 +33,44 @@ TEST(SegmentedRange, Works) } +TEST(SegmentedRange, WorksByRangeFor) +{ + std::vector begins{3, 1, 4, 9}; + std::vector ends{3, 10, 6, 10}; + std::vector> result_indices(begins.size()); + gko::segmented_range range{begins.data(), ends.data(), + static_cast(begins.size())}; + + for (auto [row, segment] : range) { + for (auto nz : segment) { + result_indices[row].push_back(nz); + } + } + + ASSERT_EQ(result_indices, + std::vector>( + {{}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {4, 5}, {9}})); +} + + +TEST(SegmentedRange, WorksWithPtrsConstructor) +{ + std::vector ptrs{0, 2, 4, 5, 9}; + std::vector> result_indices(ptrs.size() - 1); + gko::segmented_range range{ptrs.data(), + static_cast(ptrs.size() - 1)}; + + for (auto row 
: range.segment_indices()) { + for (auto nz : range[row]) { + result_indices[row].push_back(nz); + } + } + + ASSERT_EQ(result_indices, std::vector>( + {{0, 1}, {2, 3}, {4}, {5, 6, 7, 8}})); +} + + TEST(SegmentedValueRange, WorksByIndex) { std::vector begins{3, 1, 4, 9}; @@ -79,6 +117,26 @@ TEST(SegmentedValueRange, WorksByRangeFor) } +TEST(SegmentedValueRange, WorksWithPtrsConstructor) +{ + std::vector ptrs{0, 2, 4, 5, 9}; + std::vector values(ptrs.back()); + std::iota(values.begin(), values.end(), 1); + std::vector> result_values(ptrs.size() - 1); + gko::segmented_value_range::iterator> range{ + ptrs.data(), values.begin(), static_cast(ptrs.size() - 1)}; + + for (auto row : range.segment_indices()) { + for (auto nz : range[row]) { + result_values[row].push_back(nz); + } + } + + ASSERT_EQ(result_values, std::vector>( + {{1, 2}, {3, 4}, {5}, {6, 7, 8, 9}})); +} + + TEST(SegmentedEnumeratedValueRange, WorksByIndex) { using gko::get; @@ -92,7 +150,7 @@ TEST(SegmentedEnumeratedValueRange, WorksByIndex) begins.data(), ends.data(), values.begin(), static_cast(begins.size())}; - for (auto row : gko::irange(begins.size())) { + for (auto row : range.segment_indices()) { for (auto tuple : range.enumerated()[row]) { result_indices[row].push_back(get<0>(tuple)); result_values[row].push_back(get<1>(tuple)); From 6b2f01997d3ee7eace6bd149f4e651f37f5a307b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 1 May 2024 13:33:22 +0200 Subject: [PATCH 107/448] add documentation --- core/base/iterator_factory.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp index de5af49e24f..54e7fecb94e 100644 --- a/core/base/iterator_factory.hpp +++ b/core/base/iterator_factory.hpp @@ -710,6 +710,7 @@ permute_iterator make_permute_iterator( } // namespace detail +/** std::get reimplementation for device_tuple. 
*/ template constexpr typename std::tuple_element>::type& get(detail::device_tuple& tuple) @@ -718,6 +719,7 @@ get(detail::device_tuple& tuple) } +/** std::get reimplementation for const device_tuple. */ template constexpr const typename std::tuple_element>::type& From e1de865f01c4f7f0194049d8a6c4cdd90c61dcc6 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 1 May 2024 13:51:01 +0200 Subject: [PATCH 108/448] add assertion tests for segmented ranges --- core/test/base/segmented_range.cpp | 62 ++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/core/test/base/segmented_range.cpp b/core/test/base/segmented_range.cpp index 76f5ae8ffcc..63079bc2e3f 100644 --- a/core/test/base/segmented_range.cpp +++ b/core/test/base/segmented_range.cpp @@ -193,3 +193,65 @@ TEST(SegmentedEnumeratedValueRange, WorksByRangeFor) std::vector>( {{}, {2, 3, 4, 5, 6, 7, 8, 9, 10}, {5, 6}, {10}})); } + + +#ifndef NDEBUG + + +bool check_assertion_exit_code(int exit_code) +{ +#ifdef _MSC_VER + // MSVC picks up the exit code incorrectly, + // so we can only check that it exits + return true; +#else + return exit_code != 0; +#endif +} + + +TEST(DeathTest, Assertions) +{ + using range_t = gko::segmented_range; + using vrange_t = gko::segmented_value_range; + using range_it_t = range_t::iterator; + using vrange_it_t = vrange_t::iterator; + std::vector ptrs{0, 1}; + std::vector values{0, 1}; + range_t range{ptrs.data(), static_cast(ptrs.size() - 1)}; + range_t range2{ptrs.data(), 0}; + vrange_t vrange{ptrs.data(), values.data(), + static_cast(ptrs.size() - 1)}; + vrange_t vrange2{ptrs.data(), values.data(), 0}; + // gko::segmented_range::iterator + EXPECT_EXIT((void)*(range_it_t{range, -1}), check_assertion_exit_code, ""); + EXPECT_EXIT((void)*(range_it_t{range, 1}), check_assertion_exit_code, ""); + EXPECT_EXIT((void)(range_it_t{range, 0} == range_it_t{range2, 0}), + check_assertion_exit_code, ""); + // gko::segmented_range + EXPECT_EXIT((void)(range_t{nullptr, -1}), 
check_assertion_exit_code, ""); + EXPECT_EXIT((void)range[-1], check_assertion_exit_code, ""); + EXPECT_EXIT((void)range[1], check_assertion_exit_code, ""); + EXPECT_EXIT((void)range.begin_index(-1), check_assertion_exit_code, ""); + EXPECT_EXIT((void)range.begin_index(1), check_assertion_exit_code, ""); + EXPECT_EXIT((void)range.end_index(-1), check_assertion_exit_code, ""); + EXPECT_EXIT((void)range.end_index(1), check_assertion_exit_code, ""); + // gko::segmented_value_range::iterator + EXPECT_EXIT((void)*(vrange_it_t{vrange, -1}), check_assertion_exit_code, + ""); + EXPECT_EXIT((void)*(vrange_it_t{vrange, 1}), check_assertion_exit_code, ""); + EXPECT_EXIT((void)(vrange_it_t{vrange, 0} == vrange_it_t{vrange2, 0}), + check_assertion_exit_code, ""); + // gko::segmented_value_range + EXPECT_EXIT((void)(vrange_t{nullptr, nullptr, -1}), + check_assertion_exit_code, ""); + EXPECT_EXIT((void)vrange[-1], check_assertion_exit_code, ""); + EXPECT_EXIT((void)vrange[1], check_assertion_exit_code, ""); + EXPECT_EXIT((void)vrange.begin_index(-1), check_assertion_exit_code, ""); + EXPECT_EXIT((void)vrange.begin_index(1), check_assertion_exit_code, ""); + EXPECT_EXIT((void)vrange.end_index(-1), check_assertion_exit_code, ""); + EXPECT_EXIT((void)vrange.end_index(1), check_assertion_exit_code, ""); +} + + +#endif From bc09d636a67e6634b220ccd74cfc3f86ec16a1dc Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 1 May 2024 14:17:58 +0200 Subject: [PATCH 109/448] test segmented ranges on the device --- test/base/CMakeLists.txt | 1 + test/base/segmented_range.cpp | 69 +++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 test/base/segmented_range.cpp diff --git a/test/base/CMakeLists.txt b/test/base/CMakeLists.txt index 5f31c25db19..bc2ea73620f 100644 --- a/test/base/CMakeLists.txt +++ b/test/base/CMakeLists.txt @@ -4,4 +4,5 @@ ginkgo_create_common_device_test(index_range) ginkgo_create_common_device_test(iterator_factory) 
ginkgo_create_common_device_test(kernel_launch_generic) ginkgo_create_common_and_reference_test(executor) +ginkgo_create_common_device_test(segmented_range) ginkgo_create_common_and_reference_test(timer) diff --git a/test/base/segmented_range.cpp b/test/base/segmented_range.cpp new file mode 100644 index 00000000000..54c491a8493 --- /dev/null +++ b/test/base/segmented_range.cpp @@ -0,0 +1,69 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + + +#include + + +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "core/base/segmented_range.hpp" +#include "core/test/utils.hpp" +#include "test/utils/executor.hpp" + + +class SegmentedRange : public CommonTestFixture { +public: + SegmentedRange() + : ptrs{exec, {0, 0, 1, 3, 4, 9}}, + values{exec, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, + output{exec, 2 * values.get_size()} + {} + + gko::array ptrs; + gko::array values; + gko::array output; +}; + + +// nvcc doesn't like device lambdas declared in complex classes, move it out +void run_segmented_range(std::shared_ptr exec, + const gko::array& ptrs, + const gko::array& values, gko::array& output) +{ + gko::kernels::EXEC_NAMESPACE::run_kernel( + exec, + [] GKO_KERNEL(auto i, auto ptrs, auto values, auto output, auto size) { + gko::segmented_range range{ptrs, size}; + for (auto [row, segment] : range) { + for (auto nz : segment) { + output[nz] = row; + } + } + auto num_values = ptrs[size]; + gko::segmented_value_range vrange{ptrs, values, + size}; + for (auto [row, segment] : vrange.enumerated()) { + for (auto [nz, value] : segment) { + output[nz + num_values] = row * 10 + value; + } + } + }, + 1, ptrs, values, output, static_cast(ptrs.get_size() - 1)); +} + + +TEST_F(SegmentedRange, KernelRunsSegmentedRange) +{ + gko::array expected{ + ref, {1, 2, 2, 3, 4, 4, 4, 4, 4, 11, 22, 23, 34, 45, 46, 47, 48, 49}}; + + run_segmented_range(exec, ptrs, values, output); + + GKO_ASSERT_ARRAY_EQ(output, 
expected); +} From cf29d4bec39e2f0c4046357a7c88d879c1ea4ff6 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 1 May 2024 18:29:45 +0200 Subject: [PATCH 110/448] work around nvcc issue --- core/base/iterator_factory.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp index 54e7fecb94e..dbd921d0762 100644 --- a/core/base/iterator_factory.hpp +++ b/core/base/iterator_factory.hpp @@ -288,10 +288,7 @@ class zip_iterator_reference template constexpr value_type cast_impl(std::index_sequence) const { - // gcc 5 throws error as using uninitialized array - // std::tuple t = { 1, '2' }; is not allowed. - // converting to 'std::tuple<...>' from initializer list would use - // explicit constructor + // need to use fully qualified name for nvcc 11.x to not call this->get return value_type(gko::get(*this)...); } @@ -299,6 +296,7 @@ class zip_iterator_reference constexpr void assign_impl(std::index_sequence, const value_type& other) { + // need to use fully qualified name for nvcc 11.x to not call this->get (void)std::initializer_list{ (gko::get(*this) = gko::get(other), 0)...}; } From c641cd2cc6908980316bb423ae34e06cbdc2dbab Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 9 Jul 2024 00:13:49 +0200 Subject: [PATCH 111/448] formatting --- core/base/segmented_range.hpp | 1 - core/test/base/segmented_range.cpp | 6 ++---- test/base/segmented_range.cpp | 6 ++---- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/core/base/segmented_range.hpp b/core/base/segmented_range.hpp index 3820dbfb06f..9bab7e457d6 100644 --- a/core/base/segmented_range.hpp +++ b/core/base/segmented_range.hpp @@ -9,7 +9,6 @@ #include #include - #include "core/base/index_range.hpp" #include "core/base/iterator_factory.hpp" diff --git a/core/test/base/segmented_range.cpp b/core/test/base/segmented_range.cpp index 63079bc2e3f..6067ab13ca2 100644 --- a/core/test/base/segmented_range.cpp +++ 
b/core/test/base/segmented_range.cpp @@ -2,17 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/base/segmented_range.hpp" + #include #include #include - #include -#include "core/base/segmented_range.hpp" - - TEST(SegmentedRange, WorksByIndex) { std::vector begins{3, 1, 4, 9}; diff --git a/test/base/segmented_range.cpp b/test/base/segmented_range.cpp index 54c491a8493..436a6fb8a55 100644 --- a/test/base/segmented_range.cpp +++ b/test/base/segmented_range.cpp @@ -2,17 +2,15 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include "core/base/segmented_range.hpp" +#include #include - #include - #include "common/unified/base/kernel_launch.hpp" -#include "core/base/segmented_range.hpp" #include "core/test/utils.hpp" #include "test/utils/executor.hpp" From bca42c2a8011becc2a3f14e692eccfc8cb0ddc91 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 1 Aug 2024 10:43:05 +0200 Subject: [PATCH 112/448] rename segmented_range to segmented_index_range --- core/base/segmented_range.hpp | 25 ++++++++++++++----------- core/test/base/segmented_range.cpp | 18 +++++++++--------- test/base/segmented_range.cpp | 15 ++++++++------- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/core/base/segmented_range.hpp b/core/base/segmented_range.hpp index 9bab7e457d6..d3ec8e1da73 100644 --- a/core/base/segmented_range.hpp +++ b/core/base/segmented_range.hpp @@ -23,7 +23,7 @@ namespace gko { * @tparam IndexType the type of indices used to represent the segments. 
*/ template -class segmented_range { +class segmented_index_range { public: using index_type = IndexType; using index_iterator_type = index_iterator; @@ -34,7 +34,8 @@ class segmented_range { */ class iterator { public: - constexpr explicit iterator(segmented_range range, index_type segment) + constexpr explicit iterator(segmented_index_range range, + index_type segment) : range_{range}, segment_{segment} {} @@ -70,7 +71,7 @@ class segmented_range { } private: - segmented_range range_; + segmented_index_range range_; index_type segment_; }; @@ -83,9 +84,9 @@ class segmented_range { * @param num_segments the number of segments, i.e. the size of the * beginning and end index arrays. */ - constexpr explicit segmented_range(const index_type* begins, - const index_type* ends, - index_type num_segments) + constexpr explicit segmented_index_range(const index_type* begins, + const index_type* ends, + index_type num_segments) : begins_{begins}, ends_{ends}, num_segments_{num_segments} { assert(num_segments_ >= 0); @@ -99,9 +100,9 @@ class segmented_range { * @param num_segments the number of segments, i.e. the size of the * ptrs index arrays. */ - constexpr explicit segmented_range(const index_type* ptrs, - index_type num_segments) - : segmented_range{ptrs, ptrs + 1, num_segments} + constexpr explicit segmented_index_range(const index_type* ptrs, + index_type num_segments) + : segmented_index_range{ptrs, ptrs + 1, num_segments} {} /** @@ -155,7 +156,8 @@ class segmented_range { } /** Compares two ranges for equality. */ - constexpr friend bool operator==(segmented_range lhs, segmented_range rhs) + constexpr friend bool operator==(segmented_index_range lhs, + segmented_index_range rhs) { return lhs.begin_indices() == rhs.begin_indices() && lhs.end_indices() == rhs.end_indices() && @@ -163,7 +165,8 @@ class segmented_range { } /** Compares two ranges for inequality. 
*/ - constexpr friend bool operator!=(segmented_range lhs, segmented_range rhs) + constexpr friend bool operator!=(segmented_index_range lhs, + segmented_index_range rhs) { return !(lhs == rhs); } diff --git a/core/test/base/segmented_range.cpp b/core/test/base/segmented_range.cpp index 6067ab13ca2..b10b17f7e1b 100644 --- a/core/test/base/segmented_range.cpp +++ b/core/test/base/segmented_range.cpp @@ -16,8 +16,8 @@ TEST(SegmentedRange, WorksByIndex) std::vector begins{3, 1, 4, 9}; std::vector ends{3, 10, 6, 10}; std::vector> result_indices(begins.size()); - gko::segmented_range range{begins.data(), ends.data(), - static_cast(begins.size())}; + gko::segmented_index_range range{begins.data(), ends.data(), + static_cast(begins.size())}; for (auto row : range.segment_indices()) { for (auto nz : range[row]) { @@ -36,8 +36,8 @@ TEST(SegmentedRange, WorksByRangeFor) std::vector begins{3, 1, 4, 9}; std::vector ends{3, 10, 6, 10}; std::vector> result_indices(begins.size()); - gko::segmented_range range{begins.data(), ends.data(), - static_cast(begins.size())}; + gko::segmented_index_range range{begins.data(), ends.data(), + static_cast(begins.size())}; for (auto [row, segment] : range) { for (auto nz : segment) { @@ -55,8 +55,8 @@ TEST(SegmentedRange, WorksWithPtrsConstructor) { std::vector ptrs{0, 2, 4, 5, 9}; std::vector> result_indices(ptrs.size() - 1); - gko::segmented_range range{ptrs.data(), - static_cast(ptrs.size() - 1)}; + gko::segmented_index_range range{ptrs.data(), + static_cast(ptrs.size() - 1)}; for (auto row : range.segment_indices()) { for (auto nz : range[row]) { @@ -210,7 +210,7 @@ bool check_assertion_exit_code(int exit_code) TEST(DeathTest, Assertions) { - using range_t = gko::segmented_range; + using range_t = gko::segmented_index_range; using vrange_t = gko::segmented_value_range; using range_it_t = range_t::iterator; using vrange_it_t = vrange_t::iterator; @@ -221,12 +221,12 @@ TEST(DeathTest, Assertions) vrange_t vrange{ptrs.data(), values.data(), 
static_cast(ptrs.size() - 1)}; vrange_t vrange2{ptrs.data(), values.data(), 0}; - // gko::segmented_range::iterator + // gko::segmented_index_range::iterator EXPECT_EXIT((void)*(range_it_t{range, -1}), check_assertion_exit_code, ""); EXPECT_EXIT((void)*(range_it_t{range, 1}), check_assertion_exit_code, ""); EXPECT_EXIT((void)(range_it_t{range, 0} == range_it_t{range2, 0}), check_assertion_exit_code, ""); - // gko::segmented_range + // gko::segmented_index_range EXPECT_EXIT((void)(range_t{nullptr, -1}), check_assertion_exit_code, ""); EXPECT_EXIT((void)range[-1], check_assertion_exit_code, ""); EXPECT_EXIT((void)range[1], check_assertion_exit_code, ""); diff --git a/test/base/segmented_range.cpp b/test/base/segmented_range.cpp index 436a6fb8a55..86dfc21eaa6 100644 --- a/test/base/segmented_range.cpp +++ b/test/base/segmented_range.cpp @@ -12,7 +12,7 @@ #include "common/unified/base/kernel_launch.hpp" #include "core/test/utils.hpp" -#include "test/utils/executor.hpp" +#include "test/utils/common_fixture.hpp" class SegmentedRange : public CommonTestFixture { @@ -30,14 +30,15 @@ class SegmentedRange : public CommonTestFixture { // nvcc doesn't like device lambdas declared in complex classes, move it out -void run_segmented_range(std::shared_ptr exec, - const gko::array& ptrs, - const gko::array& values, gko::array& output) +void run_segmented_index_range(std::shared_ptr exec, + const gko::array& ptrs, + const gko::array& values, + gko::array& output) { - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto ptrs, auto values, auto output, auto size) { - gko::segmented_range range{ptrs, size}; + gko::segmented_index_range range{ptrs, size}; for (auto [row, segment] : range) { for (auto nz : segment) { output[nz] = row; @@ -61,7 +62,7 @@ TEST_F(SegmentedRange, KernelRunsSegmentedRange) gko::array expected{ ref, {1, 2, 2, 3, 4, 4, 4, 4, 4, 11, 22, 23, 34, 45, 46, 47, 48, 49}}; - 
run_segmented_range(exec, ptrs, values, output); + run_segmented_index_range(exec, ptrs, values, output); GKO_ASSERT_ARRAY_EQ(output, expected); } From 9cca4f7931db8e05ff8109edda687266f008a529 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 1 Aug 2024 14:30:42 +0200 Subject: [PATCH 113/448] use GKO_ASSERT in device code --- core/base/index_range.hpp | 4 +++- core/base/segmented_array.hpp | 2 +- core/base/segmented_range.hpp | 40 +++++++++++++++++------------------ 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/core/base/index_range.hpp b/core/base/index_range.hpp index ca972363b4a..0a9d1e109c6 100644 --- a/core/base/index_range.hpp +++ b/core/base/index_range.hpp @@ -10,6 +10,8 @@ #include #include +#include + #include "core/base/iterator_range.hpp" @@ -188,7 +190,7 @@ class irange : public iterator_range> { constexpr explicit irange(index_type begin, index_type end) : iterator_range{iterator{begin}, iterator{end}} { - assert(begin <= end); + GKO_ASSERT(begin <= end); } /** diff --git a/core/base/segmented_array.hpp b/core/base/segmented_array.hpp index 8999feddd01..ffa4d62e74a 100644 --- a/core/base/segmented_array.hpp +++ b/core/base/segmented_array.hpp @@ -31,7 +31,7 @@ struct device_segmented_array { constexpr segment get_segment(size_type segment_id) { - assert(segment_id < (offsets_end - offsets_begin)); + GKO_ASSERT(segment_id < (offsets_end - offsets_begin)); return {flat_begin + offsets_begin[segment_id], flat_begin + offsets_begin[segment_id + 1]}; } diff --git a/core/base/segmented_range.hpp b/core/base/segmented_range.hpp index d3ec8e1da73..546f7d62e18 100644 --- a/core/base/segmented_range.hpp +++ b/core/base/segmented_range.hpp @@ -46,8 +46,8 @@ class segmented_index_range { constexpr enumerated_segment operator*() const { - assert(segment_ >= 0); - assert(segment_ < range_.num_segments()); + GKO_ASSERT(segment_ >= 0); + GKO_ASSERT(segment_ < range_.num_segments()); return enumerated_segment{segment_, 
segment_type{range_.begin_index(segment_), range_.end_index(segment_)}}; @@ -61,7 +61,7 @@ class segmented_index_range { constexpr friend bool operator==(iterator lhs, iterator rhs) { - assert(lhs.range_ == rhs.range_); + GKO_ASSERT(lhs.range_ == rhs.range_); return lhs.segment_ == rhs.segment_; } @@ -89,7 +89,7 @@ class segmented_index_range { index_type num_segments) : begins_{begins}, ends_{ends}, num_segments_{num_segments} { - assert(num_segments_ >= 0); + GKO_ASSERT(num_segments_ >= 0); } /** @@ -113,8 +113,8 @@ class segmented_index_range { */ constexpr segment_type operator[](index_type segment) const { - assert(segment >= 0); - assert(segment < num_segments()); + GKO_ASSERT(segment >= 0); + GKO_ASSERT(segment < num_segments()); return (*iterator{*this, segment}).segment; } @@ -142,16 +142,16 @@ class segmented_index_range { /** @return the beginning index of the given segment. */ constexpr index_type begin_index(index_type segment) const { - assert(segment >= 0); - assert(segment < num_segments()); + GKO_ASSERT(segment >= 0); + GKO_ASSERT(segment < num_segments()); return begin_indices()[segment]; } /** @return the end index of the given segment. 
*/ constexpr index_type end_index(index_type segment) const { - assert(segment >= 0); - assert(segment < num_segments()); + GKO_ASSERT(segment >= 0); + GKO_ASSERT(segment < num_segments()); return end_indices()[segment]; } @@ -214,8 +214,8 @@ class segmented_value_range { constexpr enumerated_segment operator*() const { - assert(segment_ >= 0); - assert(segment_ < range_.num_segments()); + GKO_ASSERT(segment_ >= 0); + GKO_ASSERT(segment_ < range_.num_segments()); return enumerated_segment{ segment_, segment_type{range_.values() + range_.begin_index(segment_), @@ -230,7 +230,7 @@ class segmented_value_range { constexpr friend bool operator==(iterator lhs, iterator rhs) { - assert(lhs.range_ == rhs.range_); + GKO_ASSERT(lhs.range_ == rhs.range_); return lhs.segment_ == rhs.segment_; } @@ -264,7 +264,7 @@ class segmented_value_range { values_{values}, num_segments_{num_segments} { - assert(num_segments_ >= 0); + GKO_ASSERT(num_segments_ >= 0); } /** @@ -291,8 +291,8 @@ class segmented_value_range { */ constexpr segment_type operator[](index_type segment) const { - assert(segment >= 0); - assert(segment < num_segments()); + GKO_ASSERT(segment >= 0); + GKO_ASSERT(segment < num_segments()); return (*iterator{*this, segment}).segment; } @@ -328,16 +328,16 @@ class segmented_value_range { /** @return the beginning index of the given segment. */ constexpr index_type begin_index(index_type segment) const { - assert(segment >= 0); - assert(segment < num_segments()); + GKO_ASSERT(segment >= 0); + GKO_ASSERT(segment < num_segments()); return begin_indices()[segment]; } /** @return the end index of the given segment. 
*/ constexpr index_type end_index(index_type segment) const { - assert(segment >= 0); - assert(segment < num_segments()); + GKO_ASSERT(segment >= 0); + GKO_ASSERT(segment < num_segments()); return end_indices()[segment]; } From ad108c0d8e7b5627fa3c259dbb549ac2ce784fd2 Mon Sep 17 00:00:00 2001 From: nbeams <246972+nbeams@users.noreply.github.com> Date: Thu, 11 Jul 2024 02:48:55 +0000 Subject: [PATCH 114/448] Add CGS and CGS2 orthogonalization options to GMRES. Change Hessenberg data layout to facilitate CGS communication in distributed solver. --- .../unified/solver/common_gmres_kernels.cpp | 32 ++- common/unified/solver/gmres_kernels.cpp | 25 ++ core/device_hooks/common_kernels.inc.cpp | 1 + core/solver/gmres.cpp | 253 ++++++++++++++++-- core/solver/gmres_kernels.hpp | 18 +- core/test/config/solver.cpp | 3 + include/ginkgo/core/solver/gmres.hpp | 51 +++- reference/solver/common_gmres_kernels.cpp | 46 ++-- reference/solver/gmres_kernels.cpp | 21 ++ reference/test/solver/gmres_kernels.cpp | 100 +++++-- test/mpi/solver/solver.cpp | 8 +- test/solver/gmres_kernels.cpp | 86 ++++-- 12 files changed, 520 insertions(+), 124 deletions(-) diff --git a/common/unified/solver/common_gmres_kernels.cpp b/common/unified/solver/common_gmres_kernels.cpp index 0e6ba18bb64..15637fe701e 100644 --- a/common/unified/solver/common_gmres_kernels.cpp +++ b/common/unified/solver/common_gmres_kernels.cpp @@ -69,28 +69,30 @@ void hessenberg_qr(std::shared_ptr exec, exec, [] GKO_KERNEL(auto rhs, auto givens_sin, auto givens_cos, auto residual_norm, auto residual_norm_collection, - auto hessenberg_iter, auto iter, auto final_iter_nums, - auto stop_status) { + auto hessenberg_iter, auto iter, auto num_rhs, + auto final_iter_nums, auto stop_status) { using value_type = std::decay_t; if (stop_status[rhs].has_stopped()) { return; } // increment iteration count final_iter_nums[rhs]++; - auto hess_this = hessenberg_iter(0, rhs); - auto hess_next = hessenberg_iter(1, rhs); + auto hess_this = + 
hessenberg_iter(0, rhs); // hessenberg_iter(0, rhs); + auto hess_next = + hessenberg_iter(0, num_rhs + rhs); // hessenberg_iter(1, rhs); // apply previous Givens rotations to column for (decltype(iter) j = 0; j < iter; ++j) { // in here: hess_this = hessenberg_iter(j, rhs); // hess_next = hessenberg_iter(j+1, rhs); - hess_next = hessenberg_iter(j + 1, rhs); + hess_next = hessenberg_iter(0, (j + 1) * num_rhs + rhs); const auto gc = givens_cos(j, rhs); const auto gs = givens_sin(j, rhs); const auto out1 = gc * hess_this + gs * hess_next; const auto out2 = -conj(gs) * hess_this + conj(gc) * hess_next; - hessenberg_iter(j, rhs) = out1; - hessenberg_iter(j + 1, rhs) = hess_this = out2; - hess_next = hessenberg_iter(j + 2, rhs); + hessenberg_iter(0, j * num_rhs + rhs) = out1; + hessenberg_iter(0, (j + 1) * num_rhs + rhs) = hess_this = out2; + hess_next = hessenberg_iter(0, (j + 2) * num_rhs + rhs); } // hess_this is hessenberg_iter(iter, rhs) and // hess_next is hessenberg_iter(iter + 1, rhs) @@ -110,8 +112,9 @@ void hessenberg_qr(std::shared_ptr exec, givens_sin(iter, rhs) = gs = conj(hess_next) / hypotenuse; } // apply new Givens rotation to column - hessenberg_iter(iter, rhs) = gc * hess_this + gs * hess_next; - hessenberg_iter(iter + 1, rhs) = zero(); + hessenberg_iter(0, iter * num_rhs + rhs) = + gc * hess_this + gs * hess_next; + hessenberg_iter(0, (iter + 1) * num_rhs + rhs) = zero(); // apply new Givens rotation to RHS of least-squares problem const auto rnc_new = -conj(gs) * residual_norm_collection(iter, rhs); @@ -120,9 +123,9 @@ void hessenberg_qr(std::shared_ptr exec, gc * residual_norm_collection(iter, rhs); residual_norm(0, rhs) = abs(rnc_new); }, - hessenberg_iter->get_size()[1], givens_sin, givens_cos, residual_norm, - residual_norm_collection, hessenberg_iter, iter, final_iter_nums, - stop_status); + residual_norm->get_size()[1], givens_sin, givens_cos, residual_norm, + residual_norm_collection, hessenberg_iter, iter, + residual_norm->get_size()[1], 
final_iter_nums, stop_status); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -146,7 +149,8 @@ void solve_krylov(std::shared_ptr exec, for (int64 i = sizes[col] - 1; i >= 0; i--) { auto value = rhs(i, col); for (int64 j = i + 1; j < sizes[col]; j++) { - value -= mtx(i, j * num_cols + col) * y(j, col); + // i is the Krylov vector, j is Arnoldi iter + value -= mtx(j, i * num_cols + col) * y(j, col); } // y(i) = (rhs(i) - U(i,i+1:) * y(i+1:)) / U(i, i) y(i, col) = value / mtx(i, i * num_cols + col); diff --git a/common/unified/solver/gmres_kernels.cpp b/common/unified/solver/gmres_kernels.cpp index 3997963f8d7..c10dc2562e5 100644 --- a/common/unified/solver/gmres_kernels.cpp +++ b/common/unified/solver/gmres_kernels.cpp @@ -8,6 +8,7 @@ #include #include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" namespace gko { @@ -94,6 +95,30 @@ void multi_axpy(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL); +template +void multi_dot(std::shared_ptr exec, + const matrix::Dense* krylov_bases, + const matrix::Dense* next_krylov, + matrix::Dense* hessenberg_col) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto row, auto col, auto bases, auto next_krylov, + auto num_rhs, auto num_rows) { + auto irhs = col % num_rhs; // which rhs + auto ivec = col / num_rhs; // which Krylov vector + return conj(bases(ivec * num_rows + row, irhs)) * + next_krylov(row, irhs); + }, + GKO_KERNEL_REDUCE_SUM(ValueType), hessenberg_col->get_values(), + gko::dim<2>{next_krylov->get_size()[0], + hessenberg_col->get_size()[1] - next_krylov->get_size()[1]}, + krylov_bases, next_krylov, next_krylov->get_size()[1], + next_krylov->get_size()[0]); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL); + } // namespace gmres } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 
f5dc92ce16e..1ba925e94e3 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -553,6 +553,7 @@ namespace gmres { GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_RESTART_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL); } // namespace gmres diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index cd3d88a5c02..f6fb254cf94 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -33,9 +33,25 @@ GKO_REGISTER_OPERATION(restart, gmres::restart); GKO_REGISTER_OPERATION(hessenberg_qr, common_gmres::hessenberg_qr); GKO_REGISTER_OPERATION(solve_krylov, common_gmres::solve_krylov); GKO_REGISTER_OPERATION(multi_axpy, gmres::multi_axpy); +GKO_REGISTER_OPERATION(multi_dot, gmres::multi_dot); } // anonymous namespace + + +std::ostream& operator<<(std::ostream& stream, orthog_method orthog) +{ + switch (orthog) { + case orthog_method::mgs: + return stream << "mgs"; + case orthog_method::cgs: + return stream << "cgs"; + case orthog_method::cgs2: + return stream << "cgs2"; + } + return stream; +} + } // namespace gmres @@ -52,6 +68,20 @@ typename Gmres::parameters_type Gmres::parse( if (auto& obj = config.get("flexible")) { params.with_flexible(gko::config::get_value(obj)); } + if (auto& obj = config.get("orthog_method")) { + auto str = obj.get_string(); + gmres::orthog_method orthog; + if (str == "mgs") { + orthog = gmres::orthog_method::mgs; + } else if (str == "cgs") { + orthog = gmres::orthog_method::cgs; + } else if (str == "cgs2") { + orthog = gmres::orthog_method::cgs2; + } else { + GKO_INVALID_CONFIG_VALUE("orthog_method", str); + } + params.with_orthog_method(orthog); + } return params; } @@ -112,6 +142,155 @@ struct help_compute_norm { } }; +namespace { +// Orthogonalization helper functions +template +void orthogonalize_mgs(matrix::Dense* hessenberg_iter, + VectorType* krylov_bases, VectorType* next_krylov, + array& reduction_tmp, size_type 
restart_iter, + size_type num_rows, size_type num_rhs, + size_type local_num_rows) +{ + for (size_type i = 0; i <= restart_iter; i++) { + // orthogonalize against krylov_bases(:, i): + // hessenberg(i, restart_iter) = next_krylov' * krylov_bases(:, + // i) next_krylov -= hessenberg(i, restart_iter) * + // krylov_bases(:, i) + auto hessenberg_entry = hessenberg_iter->create_submatrix( + span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs}); + auto krylov_basis = ::gko::detail::create_submatrix_helper( + krylov_bases, dim<2>{num_rows, num_rhs}, + span{local_num_rows * i, local_num_rows * (i + 1)}, + span{0, num_rhs}); + next_krylov->compute_conj_dot(krylov_basis, hessenberg_entry, + reduction_tmp); + next_krylov->sub_scaled(hessenberg_entry, krylov_basis); + } +} + +template +void finish_reduce(matrix::Dense* hessenberg_iter, + matrix::Dense* next_krylov, + const size_type num_rhs, const size_type restart_iter) +{ + return; +} + +#if GINKGO_BUILD_MPI +template +void finish_reduce(matrix::Dense* hessenberg_iter, + experimental::distributed::Vector* next_krylov, + const size_type num_rhs, const size_type restart_iter) +{ + auto exec = hessenberg_iter->get_executor(); + const auto comm = next_krylov->get_communicator(); + exec->synchronize(); + // hessenberg_iter is the size of all non-zeros for this iteration -- but we + // are not setting the last values for each rhs (values that would be below + // the diagonal in the "full" matrix. 
+ auto hessenberg_reduce = hessenberg_iter->create_submatrix( + span{0, 1}, span{0, num_rhs * (restart_iter + 1)}); + if (experimental::mpi::requires_host_buffer(exec, comm)) { + ::gko::detail::DenseCache host_reduction_buffer; + host_reduction_buffer.init(exec->get_master(), + hessenberg_reduce->get_size()); + host_reduction_buffer->copy_from(hessenberg_reduce); + comm.all_reduce(exec->get_master(), host_reduction_buffer->get_values(), + static_cast(hessenberg_reduce->get_size()[1]), + MPI_SUM); + hessenberg_reduce->copy_from(host_reduction_buffer.get()); + } else { + comm.all_reduce(exec, hessenberg_reduce->get_values(), + static_cast(hessenberg_reduce->get_size()[1]), + MPI_SUM); + } +} +#endif + +template +void orthogonalize_cgs(matrix::Dense* hessenberg_iter, + VectorType* krylov_bases, VectorType* next_krylov, + size_type restart_iter, size_type num_rows, + size_type num_rhs, size_type local_num_rows) +{ + auto exec = hessenberg_iter->get_executor(); + // hessenberg(0:restart_iter, restart_iter) = krylov_basis' * + // next_krylov + auto krylov_basis_small = ::gko::detail::create_submatrix_helper( + krylov_bases, dim<2>{num_rows, num_rhs}, + span{0, local_num_rows * (restart_iter + 1)}, span{0, num_rhs}); + exec->run(gmres::make_multi_dot( + gko::detail::get_local(krylov_basis_small.get()), + gko::detail::get_local(next_krylov), hessenberg_iter)); + finish_reduce(hessenberg_iter, next_krylov, num_rhs, restart_iter); + for (size_type i = 0; i <= restart_iter; i++) { + // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, + // i) + auto hessenberg_entry = hessenberg_iter->create_submatrix( + span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs}); + auto krylov_col = ::gko::detail::create_submatrix_helper( + krylov_bases, dim<2>{num_rows, num_rhs}, + span{local_num_rows * i, local_num_rows * (i + 1)}, + span{0, num_rhs}); + next_krylov->sub_scaled(hessenberg_entry, krylov_col); + } +} + + +template +void orthogonalize_cgs2(matrix::Dense* hessenberg_iter, + 
VectorType* krylov_bases, VectorType* next_krylov, + matrix::Dense* hessenberg_aux, + matrix::Dense* one_op, + size_type restart_iter, size_type num_rows, + size_type num_rhs, size_type local_num_rows) +{ + auto exec = hessenberg_iter->get_executor(); + // hessenberg(0:restart_iter, restart_iter) = krylov_bases' * + // next_krylov + auto krylov_basis_small = ::gko::detail::create_submatrix_helper( + krylov_bases, dim<2>{num_rows, num_rhs}, + span{0, local_num_rows * (restart_iter + 1)}, span{0, num_rhs}); + exec->run(gmres::make_multi_dot( + gko::detail::get_local(krylov_basis_small.get()), + gko::detail::get_local(next_krylov), hessenberg_iter)); + finish_reduce(hessenberg_iter, next_krylov, num_rhs, restart_iter); + for (size_type i = 0; i <= restart_iter; i++) { + // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, + // i) + auto hessenberg_entry = hessenberg_iter->create_submatrix( + span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs}); + auto krylov_col = ::gko::detail::create_submatrix_helper( + krylov_bases, dim<2>{num_rows, num_rhs}, + span{local_num_rows * i, local_num_rows * (i + 1)}, + span{0, num_rhs}); + next_krylov->sub_scaled(hessenberg_entry, krylov_col); + } + // Re-orthogonalize + auto hessenberg_aux_iter = hessenberg_aux->create_submatrix( + span{0, 1}, span{0, (restart_iter + 2) * num_rhs}); + exec->run(gmres::make_multi_dot( + gko::detail::get_local(krylov_basis_small.get()), + gko::detail::get_local(next_krylov), hessenberg_aux_iter.get())); + finish_reduce(hessenberg_aux_iter.get(), next_krylov, num_rhs, + restart_iter); + + for (size_type i = 0; i <= restart_iter; i++) { + // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, + // i) + auto hessenberg_entry = hessenberg_aux->create_submatrix( + span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs}); + auto krylov_col = ::gko::detail::create_submatrix_helper( + krylov_bases, dim<2>{num_rows, num_rhs}, + span{local_num_rows * i, local_num_rows * (i + 1)}, + span{0, num_rhs}); + 
next_krylov->sub_scaled(hessenberg_entry, krylov_col); + } + // Add both Hessenberg columns + hessenberg_iter->add_scaled(one_op, hessenberg_aux_iter); +} +} // anonymous namespace + template struct help_compute_norm::value>> { @@ -127,7 +306,6 @@ struct help_compute_norm template void Gmres::apply_dense_impl(const VectorType* dense_b, @@ -161,9 +339,23 @@ void Gmres::apply_dense_impl(const VectorType* dense_b, dim<2>{num_rows * (krylov_dim + 1), num_rhs}, dim<2>{local_num_rows * (krylov_dim + 1), num_rhs}); } - // rows: rows of Hessenberg matrix, columns: block for each entry + // The Hessenberg matrix formed by the Arnoldi process is of shape + // (krylov_dim + 1) x (krylov_dim) for a single RHS. The (i,j)th + // entry is associated with the ith Krylov basis vector and the jth + // iteration of Arnoldi. + // For ease of using the reduction kernels locally and for having + // contiguous memory for communicating in the distributed case, we + // will store the Hessenberg matrix in the shape + // (krylov_dim) x ((krylov_dim + 1) * num_rhs), where the (i,j)th + // entry is associated with the ith iteration and the (j/num_rhs)th + // Krylov basis vector, for the (j % num_rhs)th RHS vector. 
auto hessenberg = this->template create_workspace_op( - ws::hessenberg, dim<2>{krylov_dim + 1, krylov_dim * num_rhs}); + ws::hessenberg, dim<2>{krylov_dim, (krylov_dim + 1) * num_rhs}); + LocalVector* hessenberg_aux = nullptr; + if (this->parameters_.orthog_method == gmres::orthog_method::cgs2) { + hessenberg_aux = this->template create_workspace_op( + ws::hessenberg_aux, dim<2>{1, (krylov_dim + 1) * num_rhs}); + } auto givens_sin = this->template create_workspace_op( ws::givens_sin, dim<2>{krylov_dim, num_rhs}); auto givens_cos = this->template create_workspace_op( @@ -312,36 +504,39 @@ void Gmres::apply_dense_impl(const VectorType* dense_b, this->get_preconditioner()->apply(this_krylov, preconditioned_krylov_vector); - // Create view of current column in the hessenberg matrix: - // hessenberg_iter = hessenberg(:, restart_iter); - auto hessenberg_iter = hessenberg->create_submatrix( - span{0, restart_iter + 2}, - span{num_rhs * restart_iter, num_rhs * (restart_iter + 1)}); + // Create view of current "column" in the hessenberg matrix: + // hessenberg_iter = hessenberg(:, restart_iter), which + // is actually stored as a row, hessenberg(restart_iter, :) + auto hessenberg_iter = + hessenberg->create_submatrix(span{restart_iter, restart_iter + 1}, + span{0, num_rhs * (restart_iter + 2)}); // Start of Arnoldi // next_krylov = A * preconditioned_krylov_vector this->get_system_matrix()->apply(preconditioned_krylov_vector, next_krylov); - - for (size_type i = 0; i <= restart_iter; i++) { - // orthogonalize against krylov_bases(:, i): - // hessenberg(i, restart_iter) = next_krylov' * krylov_bases(:, i) - // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, i) - auto hessenberg_entry = hessenberg_iter->create_submatrix( - span{i, i + 1}, span{0, num_rhs}); - auto krylov_basis = ::gko::detail::create_submatrix_helper( - krylov_bases, dim<2>{num_rows, num_rhs}, - span{local_num_rows * i, local_num_rows * (i + 1)}, - span{0, num_rhs}); - 
next_krylov->compute_conj_dot(krylov_basis, hessenberg_entry, - reduction_tmp); - next_krylov->sub_scaled(hessenberg_entry, krylov_basis); + if (this->parameters_.orthog_method == gmres::orthog_method::mgs) { + orthogonalize_mgs(hessenberg_iter.get(), krylov_bases, + next_krylov.get(), reduction_tmp, restart_iter, + num_rows, num_rhs, local_num_rows); + } else if (this->parameters_.orthog_method == + gmres::orthog_method::cgs) { + orthogonalize_cgs(hessenberg_iter.get(), krylov_bases, + next_krylov.get(), restart_iter, num_rows, + num_rhs, local_num_rows); + } else if (this->parameters_.orthog_method == + gmres::orthog_method::cgs2) { + orthogonalize_cgs2(hessenberg_iter.get(), krylov_bases, + next_krylov.get(), hessenberg_aux, one_op, + restart_iter, num_rows, num_rhs, local_num_rows); } // normalize next_krylov: // hessenberg(restart_iter+1, restart_iter) = norm(next_krylov) + // (stored in hessenberg(restart_iter, (restart_iter + 1) * num_rhs)) // next_krylov /= hessenberg(restart_iter+1, restart_iter) auto hessenberg_norm_entry = hessenberg_iter->create_submatrix( - span{restart_iter + 1, restart_iter + 2}, span{0, num_rhs}); + span{0, 1}, + span{(restart_iter + 1) * num_rhs, (restart_iter + 2) * num_rhs}); help_compute_norm::compute_next_krylov_norm_into_hessenberg( next_krylov.get(), hessenberg_norm_entry.get(), next_krylov_norm_tmp, reduction_tmp); @@ -379,7 +574,7 @@ void Gmres::apply_dense_impl(const VectorType* dense_b, } auto hessenberg_small = hessenberg->create_submatrix( - span{0, restart_iter}, span{0, num_rhs * (restart_iter)}); + span{0, restart_iter}, span{0, num_rhs * restart_iter}); // Solve upper triangular. 
// y = hessenberg \ residual_norm_collection @@ -443,7 +638,7 @@ int workspace_traits>::num_arrays(const Solver&) template int workspace_traits>::num_vectors(const Solver&) { - return 15; + return 16; } @@ -455,6 +650,7 @@ std::vector workspace_traits>::op_names( "preconditioned_vector", "krylov_bases", "hessenberg", + "hessenberg_aux", "givens_sin", "givens_cos", "residual_norm_collection", @@ -480,10 +676,9 @@ std::vector workspace_traits>::array_names( template std::vector workspace_traits>::scalars(const Solver&) { - return {hessenberg, givens_sin, - givens_cos, residual_norm_collection, - residual_norm, y, - next_krylov_norm_tmp}; + return {hessenberg, hessenberg_aux, givens_sin, + givens_cos, residual_norm_collection, residual_norm, + y, next_krylov_norm_tmp}; } diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp index 196b0de3ab0..f9fbe76279b 100644 --- a/core/solver/gmres_kernels.hpp +++ b/core/solver/gmres_kernels.hpp @@ -38,11 +38,19 @@ namespace gmres { stopping_status* stop_status) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_GMRES_RESTART_KERNEL(ValueType); \ - template \ - GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL(ValueType) +#define GKO_DECLARE_GMRES_MULTI_DOT_KERNEL(_type) \ + void multi_dot(std::shared_ptr exec, \ + const matrix::Dense<_type>* krylov_bases, \ + const matrix::Dense<_type>* next_krylov, \ + matrix::Dense<_type>* hessenberg_col) + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_GMRES_RESTART_KERNEL(ValueType); \ + template \ + GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL(ValueType); \ + template \ + GKO_DECLARE_GMRES_MULTI_DOT_KERNEL(ValueType) } // namespace gmres diff --git a/core/test/config/solver.cpp b/core/test/config/solver.cpp index 8a2f025d00a..78f1f7351f8 100644 --- a/core/test/config/solver.cpp +++ b/core/test/config/solver.cpp @@ -289,6 +289,8 @@ struct Gmres param.with_krylov_dim(3u); config_map["flexible"] = pnode{true}; param.with_flexible(true); + 
config_map["orthog_method"] = pnode{"cgs"}; + param.with_orthog_method(gko::solver::gmres::orthog_method::cgs); } template @@ -300,6 +302,7 @@ struct Gmres solver_config_test::template validate(result, answer); ASSERT_EQ(res_param.krylov_dim, ans_param.krylov_dim); ASSERT_EQ(res_param.flexible, ans_param.flexible); + ASSERT_EQ(res_param.orthog_method, ans_param.orthog_method); } }; diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp index 57bbca0b529..308dadf5218 100644 --- a/include/ginkgo/core/solver/gmres.hpp +++ b/include/ginkgo/core/solver/gmres.hpp @@ -31,6 +31,29 @@ namespace solver { constexpr size_type gmres_default_krylov_dim = 100u; +namespace gmres { +/** + * Set the orthogonalization method for the Krylov subspace. + */ +enum class orthog_method { + /** + * Modified Gram-Schmidt (default) + */ + mgs, + /** + * Classical Gram-Schmidt + */ + cgs, + /** + * Classical Gram-Schmidt with re-orthogonalization + */ + cgs2 +}; + +/** Prints an orthogonalization method. 
*/ +std::ostream& operator<<(std::ostream& stream, orthog_method orthog); + +} // namespace gmres /** * GMRES or the generalized minimal residual method is an iterative type Krylov @@ -93,6 +116,10 @@ class Gmres /** Flexible GMRES */ bool GKO_FACTORY_PARAMETER_SCALAR(flexible, false); + + /** Orthogonalization method */ + gmres::orthog_method GKO_FACTORY_PARAMETER_SCALAR( + orthog_method, gmres::orthog_method::mgs); }; GKO_ENABLE_LIN_OP_FACTORY(Gmres, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); @@ -167,28 +194,30 @@ struct workspace_traits> { constexpr static int krylov_bases = 2; // hessenberg matrix constexpr static int hessenberg = 3; + // auxiliary space for CGS2 + constexpr static int hessenberg_aux = 4; // givens sin parameters - constexpr static int givens_sin = 4; + constexpr static int givens_sin = 5; // givens cos parameters - constexpr static int givens_cos = 5; + constexpr static int givens_cos = 6; // coefficients of the residual in Krylov space - constexpr static int residual_norm_collection = 6; + constexpr static int residual_norm_collection = 7; // residual norm scalar - constexpr static int residual_norm = 7; + constexpr static int residual_norm = 8; // solution of the least-squares problem in Krylov space - constexpr static int y = 8; + constexpr static int y = 9; // solution of the least-squares problem mapped to the full space - constexpr static int before_preconditioner = 9; + constexpr static int before_preconditioner = 10; // preconditioned solution of the least-squares problem - constexpr static int after_preconditioner = 10; + constexpr static int after_preconditioner = 11; // constant 1.0 scalar - constexpr static int one = 11; + constexpr static int one = 12; // constant -1.0 scalar - constexpr static int minus_one = 12; + constexpr static int minus_one = 13; // temporary norm vector of next_krylov to copy into hessenberg matrix - constexpr static int next_krylov_norm_tmp = 13; + constexpr static int next_krylov_norm_tmp = 
14; // preconditioned krylov basis multivector - constexpr static int preconditioned_krylov_bases = 14; + constexpr static int preconditioned_krylov_bases = 15; // stopping status array constexpr static int stop = 0; diff --git a/reference/solver/common_gmres_kernels.cpp b/reference/solver/common_gmres_kernels.cpp index 643c164b828..122c224d5c1 100644 --- a/reference/solver/common_gmres_kernels.cpp +++ b/reference/solver/common_gmres_kernels.cpp @@ -30,14 +30,15 @@ template void calculate_sin_and_cos(matrix::Dense* givens_sin, matrix::Dense* givens_cos, matrix::Dense* hessenberg_iter, - size_type iter, const size_type rhs) + size_type iter, const size_type num_rhs, + const size_type rhs) { - if (is_zero(hessenberg_iter->at(iter, rhs))) { + if (is_zero(hessenberg_iter->at(0, iter * num_rhs + rhs))) { givens_cos->at(iter, rhs) = zero(); givens_sin->at(iter, rhs) = one(); } else { - auto this_hess = hessenberg_iter->at(iter, rhs); - auto next_hess = hessenberg_iter->at(iter + 1, rhs); + auto this_hess = hessenberg_iter->at(0, iter * num_rhs + rhs); + auto next_hess = hessenberg_iter->at(0, (iter + 1) * num_rhs + rhs); const auto scale = abs(this_hess) + abs(next_hess); const auto hypotenuse = scale * sqrt(abs(this_hess / scale) * abs(this_hess / scale) + @@ -52,19 +53,24 @@ template void givens_rotation(matrix::Dense* givens_sin, matrix::Dense* givens_cos, matrix::Dense* hessenberg_iter, size_type iter, + const size_type num_rhs, const stopping_status* stop_status) { - for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) { + for (size_type i = 0; i < num_rhs; ++i) { if (stop_status[i].has_stopped()) { continue; } for (size_type j = 0; j < iter; ++j) { - auto temp = givens_cos->at(j, i) * hessenberg_iter->at(j, i) + - givens_sin->at(j, i) * hessenberg_iter->at(j + 1, i); - hessenberg_iter->at(j + 1, i) = - -conj(givens_sin->at(j, i)) * hessenberg_iter->at(j, i) + - conj(givens_cos->at(j, i)) * hessenberg_iter->at(j + 1, i); - hessenberg_iter->at(j, i) = temp; 
+ auto temp = + givens_cos->at(j, i) * hessenberg_iter->at(0, j * num_rhs + i) + + givens_sin->at(j, i) * + hessenberg_iter->at(0, (j + 1) * num_rhs + i); + hessenberg_iter->at(0, (j + 1) * num_rhs + i) = + -conj(givens_sin->at(j, i)) * + hessenberg_iter->at(0, j * num_rhs + i) + + conj(givens_cos->at(j, i)) * + hessenberg_iter->at(0, (j + 1) * num_rhs + i); + hessenberg_iter->at(0, j * num_rhs + i) = temp; // temp = cos(j)*hessenberg(j) + // sin(j)*hessenberg(j+1) // hessenberg(j+1) = -conj(sin(j))*hessenberg(j) + @@ -72,12 +78,15 @@ void givens_rotation(matrix::Dense* givens_sin, // hessenberg(j) = temp; } - calculate_sin_and_cos(givens_sin, givens_cos, hessenberg_iter, iter, i); + calculate_sin_and_cos(givens_sin, givens_cos, hessenberg_iter, iter, + num_rhs, i); - hessenberg_iter->at(iter, i) = - givens_cos->at(iter, i) * hessenberg_iter->at(iter, i) + - givens_sin->at(iter, i) * hessenberg_iter->at(iter + 1, i); - hessenberg_iter->at(iter + 1, i) = zero(); + hessenberg_iter->at(0, iter * num_rhs + i) = + givens_cos->at(iter, i) * + hessenberg_iter->at(0, iter * num_rhs + i) + + givens_sin->at(iter, i) * + hessenberg_iter->at(0, (iter + 1) * num_rhs + i); + hessenberg_iter->at(0, (iter + 1) * num_rhs + i) = zero(); // hessenberg(iter) = cos(iter)*hessenberg(iter) + // sin(iter)*hessenberg(iter + 1) // hessenberg(iter+1) = 0 @@ -151,7 +160,8 @@ void hessenberg_qr(std::shared_ptr exec, } } - givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter, stop_status); + givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter, + residual_norm->get_size()[1], stop_status); calculate_next_residual_norm(givens_sin, givens_cos, residual_norm, residual_norm_collection, iter, stop_status); } @@ -176,7 +186,7 @@ void solve_krylov(std::shared_ptr exec, for (size_type j = i + 1; j < final_iter_nums[k]; ++j) { temp -= hessenberg->at( - i, j * residual_norm_collection->get_size()[1] + k) * + j, i * residual_norm_collection->get_size()[1] + k) * y->at(j, k); } y->at(i, 
k) = diff --git a/reference/solver/gmres_kernels.cpp b/reference/solver/gmres_kernels.cpp index a0b22862998..4c482632353 100644 --- a/reference/solver/gmres_kernels.cpp +++ b/reference/solver/gmres_kernels.cpp @@ -71,6 +71,27 @@ void multi_axpy(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_AXPY_KERNEL); +template +void multi_dot(std::shared_ptr exec, + const matrix::Dense* krylov_bases, + const matrix::Dense* next_krylov, + matrix::Dense* hessenberg_col) +{ + auto num_rhs = next_krylov->get_size()[1]; + auto krylov_bases_rowoffset = next_krylov->get_size()[0]; + for (size_type i = 0; i < hessenberg_col->get_size()[1]; ++i) { + auto ivec = i / num_rhs; + auto irhs = i % num_rhs; + hessenberg_col->at(0, i) = zero(); + for (size_type j = 0; j < krylov_bases_rowoffset; ++j) { + hessenberg_col->at(0, i) += + krylov_bases->at(ivec * krylov_bases_rowoffset + j, irhs) * + next_krylov->at(j, irhs); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GMRES_MULTI_DOT_KERNEL); } // namespace gmres } // namespace reference diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 00f7766179f..bc877e0ed76 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -102,7 +102,7 @@ class Gmres : public ::testing::Test { small_y = Mtx::create(exec, gko::dim<2>{small_restart, small_size[1]}); small_hessenberg = Mtx::create( exec, - gko::dim<2>{small_restart + 1, small_restart * small_size[1]}); + gko::dim<2>{small_restart, (small_restart + 1) * small_size[1]}); small_hessenberg->fill(gko::zero()); stopped.converge(1, true); @@ -222,8 +222,8 @@ TYPED_TEST(Gmres, KernelHessenbergQrIter0) this->small_residual_norm->fill(nan); this->small_residual_norm_collection = gko::initialize( {I{1.25, 1.5}, I{nan, nan}, I{95., 94.}}, this->exec); - this->small_hessenberg = gko::initialize( - {I{0.5, -0.75}, I{-0.5, 1}, I{97., 96.}}, this->exec); + 
this->small_hessenberg = + gko::initialize({I{0.5, -0.75, -0.5, 1, 97., 96.}}, this->exec); this->small_final_iter_nums.get_data()[0] = 0; this->small_final_iter_nums.get_data()[1] = 0; @@ -242,7 +242,7 @@ TYPED_TEST(Gmres, KernelHessenbergQrIter0) GKO_EXPECT_MTX_NEAR(this->small_givens_sin, l({{-0.5 * sqrt(2.), 0.8}, {-72., 73.}}), r::value); GKO_EXPECT_MTX_NEAR(this->small_hessenberg, - l({{0.5 * sqrt(2.), 1.25}, {0., 0.}, {97., 96.}}), + l({{0.5 * sqrt(2.), 1.25, 0., 0., 97., 96.}}), r::value); GKO_EXPECT_MTX_NEAR( this->small_residual_norm_collection, @@ -267,8 +267,8 @@ TYPED_TEST(Gmres, KernelHessenbergQrIter1) this->small_residual_norm->fill(nan); this->small_residual_norm_collection = gko::initialize( {I{95., 94.}, I{1.25, 1.5}, I{nan, nan}}, this->exec); - this->small_hessenberg = gko::initialize( - {I{-0.5, 4}, I{0.25, 0.5}, I{-0.5, 1}}, this->exec); + this->small_hessenberg = + gko::initialize({I{-0.5, 4, 0.25, 0.5, -0.5, 1}}, this->exec); this->small_final_iter_nums.get_data()[0] = 1; this->small_final_iter_nums.get_data()[1] = 1; @@ -287,7 +287,7 @@ TYPED_TEST(Gmres, KernelHessenbergQrIter1) GKO_EXPECT_MTX_NEAR(this->small_givens_sin, l({{0.5, 0.25}, {-0.5 * sqrt(2.), 0.8}}), r::value); GKO_EXPECT_MTX_NEAR(this->small_hessenberg, - l({{-0.375, 2.125}, {0.5 * sqrt(2.), 1.25}, {0., 0.}}), + l({{-0.375, 2.125, 0.5 * sqrt(2.), 1.25, 0., 0.}}), r::value); GKO_EXPECT_MTX_NEAR( this->small_residual_norm_collection, @@ -309,9 +309,8 @@ TYPED_TEST(Gmres, KernelSolveKrylov) this->small_final_iter_nums.get_data()[1] = restart; this->small_hessenberg = gko::initialize( // clang-format off - {{-1, 3, 2, -4}, - {0, 0, 1, 5}, - {nan, nan, nan, nan}}, + {{-1, 3, 0, 0, nan, nan}, + {2, -4, 1, 5, nan, nan}}, // clang-format on this->exec); this->small_residual_norm_collection = @@ -366,6 +365,40 @@ TYPED_TEST(Gmres, KernelMultiAxpy) r::value); } +TYPED_TEST(Gmres, KernelMultiDot) +{ + using T = typename TestFixture::value_type; + using Mtx = typename TestFixture::Mtx; + 
const T nan = std::numeric_limits>::quiet_NaN(); + const auto restart = this->small_givens_sin->get_size()[0]; + this->small_hessenberg->fill(gko::zero()); + auto hessenberg_iter = this->small_hessenberg->create_submatrix( + gko::span{0, 1}, + gko::span{0, (restart + 1) * this->small_x->get_size()[1]}); + this->small_x = gko::initialize( // next_krylov + {I{-1.0, 2.3}, I{-14.0, -22.0}, I{8.4, 14.2}}, this->exec); + + this->small_krylov_bases = gko::initialize( // restart+1 x rows x #rhs + { + I{1, 10}, // 0, 0, x + I{2, 11}, // 0, 1, x + I{3, 12}, // 0, 2, x + I{4, 13}, // 1, 0, x + I{5, 14}, // 1, 1, x + I{6, 15}, // 1, 2, x + I{7, 16}, // 2, 0, x + I{8, 17}, // 2, 1, x + I{9, 18}, // 2, 2, x + }, + this->exec); + gko::kernels::reference::gmres::multi_dot( + this->exec, this->small_krylov_bases.get(), this->small_x.get(), + hessenberg_iter.get()); + + GKO_ASSERT_MTX_NEAR(hessenberg_iter, + l({{-3.8, -48.6, -23.6, -65.1, -43.4, -81.6}}), + r::value); +} TYPED_TEST(Gmres, SolvesStencilSystem) { @@ -703,28 +736,37 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1WithRestart) TYPED_TEST(Gmres, SolvesWithPreconditioner) { + using gko::solver::gmres::orthog_method; + using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; - auto gmres_factory_preconditioner = - Solver::build() - .with_criteria(gko::stop::Iteration::build().with_max_iters(100u), - gko::stop::ResidualNorm::build() - .with_reduction_factor(r::value)) - .with_preconditioner( - gko::preconditioner::Jacobi::build() - .with_max_block_size(3u)) - .on(this->exec); - auto solver = gmres_factory_preconditioner->generate(this->mtx_big); - auto b = gko::initialize( - {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, - this->exec); - auto x = gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); - - solver->apply(b, x); - - GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), - r::value * 1e3); + for (auto 
orthog : + {orthog_method::mgs, orthog_method::cgs, orthog_method::cgs2}) { + SCOPED_TRACE(orthog); + auto gmres_factory_preconditioner = + Solver::build() + .with_orthog_method(orthog) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(100u), + gko::stop::ResidualNorm::build() + .with_reduction_factor(r::value)) + .with_preconditioner( + gko::preconditioner::Jacobi::build() + .with_max_block_size(3u)) + .on(this->exec); + auto solver = gmres_factory_preconditioner->generate(this->mtx_big); + auto b = gko::initialize( + {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, + this->exec); + auto x = + gko::initialize({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, this->exec); + + solver->apply(b, x); + + GKO_ASSERT_MTX_NEAR(x, l({33.0, -56.0, 81.0, -30.0, 21.0, 40.0}), + r::value * 1e3); + } } diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp index 589be91bcba..aaf61cb47ea 100644 --- a/test/mpi/solver/solver.cpp +++ b/test/mpi/solver/solver.cpp @@ -195,13 +195,14 @@ struct Ir : SimpleSolverTest> { }; -template +template struct Gmres : SimpleSolverTest> { static typename solver_type::parameters_type build( std::shared_ptr exec) { return SimpleSolverTest>::build( std::move(exec)) + .with_orthog_method(orthog) .with_krylov_dim(dimension); } }; @@ -531,7 +532,10 @@ class Solver : public CommonMpiTestFixture { using SolverTypes = ::testing::Types, Gcr<100u>, - Gmres<10u>, Gmres<100u>>; + Gmres<10u, gko::solver::gmres::orthog_method::mgs>, + Gmres<10u, gko::solver::gmres::orthog_method::cgs>, + Gmres<10u, gko::solver::gmres::orthog_method::cgs2>, + Gmres<100u, gko::solver::gmres::orthog_method::mgs>>; TYPED_TEST_SUITE(Solver, SolverTypes, TypenameNameGenerator); diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp index a6c74bd45c0..fb2eab5c040 100644 --- a/test/solver/gmres_kernels.cpp +++ b/test/solver/gmres_kernels.cpp @@ -74,10 +74,11 @@ class Gmres : public CommonTestFixture { b = gen_mtx(m, nrhs); krylov_bases = 
gen_mtx(m * (gko::solver::gmres_default_krylov_dim + 1), nrhs); - hessenberg = gen_mtx(gko::solver::gmres_default_krylov_dim + 1, - gko::solver::gmres_default_krylov_dim * nrhs); + hessenberg = + gen_mtx(gko::solver::gmres_default_krylov_dim, + (gko::solver::gmres_default_krylov_dim + 1) * nrhs); hessenberg_iter = - gen_mtx(gko::solver::gmres_default_krylov_dim + 1, nrhs); + gen_mtx(1, (gko::solver::gmres_default_krylov_dim + 1) * nrhs); residual = gen_mtx(m, nrhs); residual_norm = gen_mtx(1, nrhs); residual_norm_collection = @@ -272,6 +273,50 @@ TEST_F(Gmres, GmresKernelMultiAxpyIsEquivalentToRef) GKO_ASSERT_ARRAY_EQ(stop_status, d_stop_status); } +TEST_F(Gmres, GmresKernelMultiDotIsEquivalentToRef) +{ + initialize_data(); + + auto krylov_basis = krylov_bases->create_submatrix( + gko::span{ + 0, x->get_size()[0] * (gko::solver::gmres_default_krylov_dim - 1)}, + gko::span{0, x->get_size()[1]}); + auto d_krylov_basis = d_krylov_bases->create_submatrix( + gko::span{0, d_x->get_size()[0] * + (gko::solver::gmres_default_krylov_dim - 1)}, + gko::span{0, d_x->get_size()[1]}); + auto next_krylov = krylov_bases->create_submatrix( + gko::span{ + x->get_size()[0] * (gko::solver::gmres_default_krylov_dim - 1), + x->get_size()[0] * gko::solver::gmres_default_krylov_dim}, + gko::span{0, x->get_size()[1]}); + auto d_next_krylov = d_krylov_bases->create_submatrix( + gko::span{ + d_x->get_size()[0] * (gko::solver::gmres_default_krylov_dim - 1), + d_x->get_size()[0] * gko::solver::gmres_default_krylov_dim}, + gko::span{0, d_x->get_size()[1]}); + gko::kernels::reference::gmres::multi_dot( + ref, krylov_basis.get(), next_krylov.get(), hessenberg_iter.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::gmres::multi_dot( + exec, d_krylov_basis.get(), d_next_krylov.get(), + d_hessenberg_iter.get()); + + // The multidot computation does not set the value below the diagonal + // in the Hessenberg matrix column(s), as that is done after the + // orthogonalization of the next basis vector. 
In this test, we + // are checking the column(s) created on the last iteration before the + // solver's restart would be triggered, so it is only the final row of + // the Hessenberg column(s) that we ignore. + auto hessenberg_iter_small = hessenberg_iter->create_submatrix( + gko::span{0, 1}, + gko::span{0, gko::solver::gmres_default_krylov_dim * x->get_size()[1]}); + auto d_hessenberg_iter_small = d_hessenberg_iter->create_submatrix( + gko::span{0, 1}, + gko::span{0, gko::solver::gmres_default_krylov_dim * x->get_size()[1]}); + GKO_ASSERT_MTX_NEAR(d_hessenberg_iter_small, hessenberg_iter_small, + r::value); +} + TEST_F(Gmres, GmresApplyOneRHSIsEquivalentToRef) { @@ -294,18 +339,27 @@ TEST_F(Gmres, GmresApplyOneRHSIsEquivalentToRef) TEST_F(Gmres, GmresApplyMultipleRHSIsEquivalentToRef) { - int m = 123; - int n = 5; - auto ref_solver = ref_gmres_factory->generate(mtx); - auto exec_solver = exec_gmres_factory->generate(d_mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = gko::clone(exec, b); - auto d_x = gko::clone(exec, x); - - ref_solver->apply(b, x); - exec_solver->apply(d_b, d_x); + using gko::solver::gmres::orthog_method; + auto base_params = gko::clone(ref, ref_gmres_factory)->get_parameters(); - GKO_ASSERT_MTX_NEAR(d_b, b, 0); - GKO_ASSERT_MTX_NEAR(d_x, x, r::value * 1e3); + for (auto orthog : + {orthog_method::mgs, orthog_method::cgs, orthog_method::cgs2}) { + SCOPED_TRACE(orthog); + int m = 123; + int n = 5; + auto ref_solver = + base_params.with_orthog_method(orthog).on(ref)->generate(mtx); + auto exec_solver = + base_params.with_orthog_method(orthog).on(exec)->generate(d_mtx); + auto b = gen_mtx(m, n); + auto x = gen_mtx(m, n); + auto d_b = gko::clone(exec, b); + auto d_x = gko::clone(exec, x); + + ref_solver->apply(b, x); + exec_solver->apply(d_b, d_x); + + GKO_ASSERT_MTX_NEAR(d_b, b, 0); + GKO_ASSERT_MTX_NEAR(d_x, x, r::value * 1e3); + } } From c8ffffc1cf62b35528a244a6218966eb5b7f83b8 Mon Sep 17 00:00:00 2001 From: nbeams 
<246972+nbeams@users.noreply.github.com> Date: Thu, 18 Jul 2024 19:09:30 +0000 Subject: [PATCH 115/448] Minor: formatting and comment clarity --- core/solver/gmres.cpp | 12 +++++++----- core/solver/gmres_kernels.hpp | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index f6fb254cf94..7a77988be98 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -154,8 +154,9 @@ void orthogonalize_mgs(matrix::Dense* hessenberg_iter, for (size_type i = 0; i <= restart_iter; i++) { // orthogonalize against krylov_bases(:, i): // hessenberg(i, restart_iter) = next_krylov' * krylov_bases(:, - // i) next_krylov -= hessenberg(i, restart_iter) * - // krylov_bases(:, i) + // i) + // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, + // i) auto hessenberg_entry = hessenberg_iter->create_submatrix( span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs}); auto krylov_basis = ::gko::detail::create_submatrix_helper( @@ -185,9 +186,10 @@ void finish_reduce(matrix::Dense* hessenberg_iter, auto exec = hessenberg_iter->get_executor(); const auto comm = next_krylov->get_communicator(); exec->synchronize(); - // hessenberg_iter is the size of all non-zeros for this iteration -- but we - // are not setting the last values for each rhs (values that would be below - // the diagonal in the "full" matrix. + // hessenberg_iter is the size of all non-zeros for this iteration, but we + // are not setting the last values for each rhs here. Values that would be + // below the diagonal in the "full" matrix are skipped, because they will + // be used to hold the norm of next_krylov for each rhs. 
auto hessenberg_reduce = hessenberg_iter->create_submatrix( span{0, 1}, span{0, num_rhs * (restart_iter + 1)}); if (experimental::mpi::requires_host_buffer(exec, comm)) { diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp index f9fbe76279b..21bb5854816 100644 --- a/core/solver/gmres_kernels.hpp +++ b/core/solver/gmres_kernels.hpp @@ -44,6 +44,7 @@ namespace gmres { const matrix::Dense<_type>* next_krylov, \ matrix::Dense<_type>* hessenberg_col) + #define GKO_DECLARE_ALL_AS_TEMPLATES \ template \ GKO_DECLARE_GMRES_RESTART_KERNEL(ValueType); \ From 4c50f84e2bff4347ef6fb4b86dd85d34208e3d90 Mon Sep 17 00:00:00 2001 From: nbeams <246972+nbeams@users.noreply.github.com> Date: Fri, 19 Jul 2024 20:47:28 +0000 Subject: [PATCH 116/448] Reshape hessenberg_iter view to 'logical layout' (one column per rhs) for kernels that do not use the full Hessenberg matrix --- .../unified/solver/common_gmres_kernels.cpp | 29 +++++------ common/unified/solver/gmres_kernels.cpp | 6 ++- core/solver/gmres.cpp | 49 ++++++++++--------- reference/solver/common_gmres_kernels.cpp | 44 +++++++---------- reference/solver/gmres_kernels.cpp | 16 +++--- reference/test/solver/gmres_kernels.cpp | 38 ++++++++++---- test/solver/gmres_kernels.cpp | 10 ++-- 7 files changed, 102 insertions(+), 90 deletions(-) diff --git a/common/unified/solver/common_gmres_kernels.cpp b/common/unified/solver/common_gmres_kernels.cpp index 15637fe701e..679aebcfaa2 100644 --- a/common/unified/solver/common_gmres_kernels.cpp +++ b/common/unified/solver/common_gmres_kernels.cpp @@ -69,30 +69,28 @@ void hessenberg_qr(std::shared_ptr exec, exec, [] GKO_KERNEL(auto rhs, auto givens_sin, auto givens_cos, auto residual_norm, auto residual_norm_collection, - auto hessenberg_iter, auto iter, auto num_rhs, - auto final_iter_nums, auto stop_status) { + auto hessenberg_iter, auto iter, auto final_iter_nums, + auto stop_status) { using value_type = std::decay_t; if (stop_status[rhs].has_stopped()) { return; } // 
increment iteration count final_iter_nums[rhs]++; - auto hess_this = - hessenberg_iter(0, rhs); // hessenberg_iter(0, rhs); - auto hess_next = - hessenberg_iter(0, num_rhs + rhs); // hessenberg_iter(1, rhs); + auto hess_this = hessenberg_iter(0, rhs); + auto hess_next = hessenberg_iter(1, rhs); // apply previous Givens rotations to column for (decltype(iter) j = 0; j < iter; ++j) { // in here: hess_this = hessenberg_iter(j, rhs); // hess_next = hessenberg_iter(j+1, rhs); - hess_next = hessenberg_iter(0, (j + 1) * num_rhs + rhs); + hess_next = hessenberg_iter(j + 1, rhs); const auto gc = givens_cos(j, rhs); const auto gs = givens_sin(j, rhs); const auto out1 = gc * hess_this + gs * hess_next; const auto out2 = -conj(gs) * hess_this + conj(gc) * hess_next; - hessenberg_iter(0, j * num_rhs + rhs) = out1; - hessenberg_iter(0, (j + 1) * num_rhs + rhs) = hess_this = out2; - hess_next = hessenberg_iter(0, (j + 2) * num_rhs + rhs); + hessenberg_iter(j, rhs) = out1; + hessenberg_iter(j + 1, rhs) = hess_this = out2; + hess_next = hessenberg_iter(j + 2, rhs); } // hess_this is hessenberg_iter(iter, rhs) and // hess_next is hessenberg_iter(iter + 1, rhs) @@ -112,9 +110,8 @@ void hessenberg_qr(std::shared_ptr exec, givens_sin(iter, rhs) = gs = conj(hess_next) / hypotenuse; } // apply new Givens rotation to column - hessenberg_iter(0, iter * num_rhs + rhs) = - gc * hess_this + gs * hess_next; - hessenberg_iter(0, (iter + 1) * num_rhs + rhs) = zero(); + hessenberg_iter(iter, rhs) = gc * hess_this + gs * hess_next; + hessenberg_iter(iter + 1, rhs) = zero(); // apply new Givens rotation to RHS of least-squares problem const auto rnc_new = -conj(gs) * residual_norm_collection(iter, rhs); @@ -123,9 +120,9 @@ void hessenberg_qr(std::shared_ptr exec, gc * residual_norm_collection(iter, rhs); residual_norm(0, rhs) = abs(rnc_new); }, - residual_norm->get_size()[1], givens_sin, givens_cos, residual_norm, - residual_norm_collection, hessenberg_iter, iter, - residual_norm->get_size()[1], 
final_iter_nums, stop_status); + hessenberg_iter->get_size()[1], givens_sin, givens_cos, residual_norm, + residual_norm_collection, hessenberg_iter, iter, final_iter_nums, + stop_status); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( diff --git a/common/unified/solver/gmres_kernels.cpp b/common/unified/solver/gmres_kernels.cpp index c10dc2562e5..f24ae445edb 100644 --- a/common/unified/solver/gmres_kernels.cpp +++ b/common/unified/solver/gmres_kernels.cpp @@ -111,8 +111,10 @@ void multi_dot(std::shared_ptr exec, next_krylov(row, irhs); }, GKO_KERNEL_REDUCE_SUM(ValueType), hessenberg_col->get_values(), - gko::dim<2>{next_krylov->get_size()[0], - hessenberg_col->get_size()[1] - next_krylov->get_size()[1]}, + gko::dim<2>{ + next_krylov->get_size()[0], + hessenberg_col->get_size()[0] * hessenberg_col->get_size()[1] - + next_krylov->get_size()[1]}, krylov_bases, next_krylov, next_krylov->get_size()[1], next_krylov->get_size()[0]); } diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index 7a77988be98..d47eb4428ea 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -157,8 +157,8 @@ void orthogonalize_mgs(matrix::Dense* hessenberg_iter, // i) // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, // i) - auto hessenberg_entry = hessenberg_iter->create_submatrix( - span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs}); + auto hessenberg_entry = + hessenberg_iter->create_submatrix(span{i, i + 1}, span{0, num_rhs}); auto krylov_basis = ::gko::detail::create_submatrix_helper( krylov_bases, dim<2>{num_rows, num_rhs}, span{local_num_rows * i, local_num_rows * (i + 1)}, @@ -191,19 +191,18 @@ void finish_reduce(matrix::Dense* hessenberg_iter, // below the diagonal in the "full" matrix are skipped, because they will // be used to hold the norm of next_krylov for each rhs. 
auto hessenberg_reduce = hessenberg_iter->create_submatrix( - span{0, 1}, span{0, num_rhs * (restart_iter + 1)}); + span{0, restart_iter + 1}, span{0, num_rhs}); + int message_size = static_cast((restart_iter + 1) * num_rhs); if (experimental::mpi::requires_host_buffer(exec, comm)) { ::gko::detail::DenseCache host_reduction_buffer; host_reduction_buffer.init(exec->get_master(), hessenberg_reduce->get_size()); host_reduction_buffer->copy_from(hessenberg_reduce); comm.all_reduce(exec->get_master(), host_reduction_buffer->get_values(), - static_cast(hessenberg_reduce->get_size()[1]), - MPI_SUM); + message_size, MPI_SUM); hessenberg_reduce->copy_from(host_reduction_buffer.get()); } else { - comm.all_reduce(exec, hessenberg_reduce->get_values(), - static_cast(hessenberg_reduce->get_size()[1]), + comm.all_reduce(exec, hessenberg_reduce->get_values(), message_size, MPI_SUM); } } @@ -228,8 +227,8 @@ void orthogonalize_cgs(matrix::Dense* hessenberg_iter, for (size_type i = 0; i <= restart_iter; i++) { // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, // i) - auto hessenberg_entry = hessenberg_iter->create_submatrix( - span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs}); + auto hessenberg_entry = + hessenberg_iter->create_submatrix(span{i, i + 1}, span{0, num_rhs}); auto krylov_col = ::gko::detail::create_submatrix_helper( krylov_bases, dim<2>{num_rows, num_rhs}, span{local_num_rows * i, local_num_rows * (i + 1)}, @@ -260,8 +259,8 @@ void orthogonalize_cgs2(matrix::Dense* hessenberg_iter, for (size_type i = 0; i <= restart_iter; i++) { // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, // i) - auto hessenberg_entry = hessenberg_iter->create_submatrix( - span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs}); + auto hessenberg_entry = + hessenberg_iter->create_submatrix(span{i, i + 1}, span{0, num_rhs}); auto krylov_col = ::gko::detail::create_submatrix_helper( krylov_bases, dim<2>{num_rows, num_rhs}, span{local_num_rows * i, local_num_rows * (i + 1)}, @@ 
-270,7 +269,7 @@ void orthogonalize_cgs2(matrix::Dense* hessenberg_iter, } // Re-orthogonalize auto hessenberg_aux_iter = hessenberg_aux->create_submatrix( - span{0, 1}, span{0, (restart_iter + 2) * num_rhs}); + span{0, restart_iter + 2}, span{0, num_rhs}); exec->run(gmres::make_multi_dot( gko::detail::get_local(krylov_basis_small.get()), gko::detail::get_local(next_krylov), hessenberg_aux_iter.get())); @@ -280,8 +279,8 @@ void orthogonalize_cgs2(matrix::Dense* hessenberg_iter, for (size_type i = 0; i <= restart_iter; i++) { // next_krylov -= hessenberg(i, restart_iter) * krylov_bases(:, // i) - auto hessenberg_entry = hessenberg_aux->create_submatrix( - span{0, 1}, span{i * num_rhs, (i + 1) * num_rhs}); + auto hessenberg_entry = + hessenberg_aux->create_submatrix(span{i, i + 1}, span{0, num_rhs}); auto krylov_col = ::gko::detail::create_submatrix_helper( krylov_bases, dim<2>{num_rows, num_rhs}, span{local_num_rows * i, local_num_rows * (i + 1)}, @@ -353,10 +352,13 @@ void Gmres::apply_dense_impl(const VectorType* dense_b, // Krylov basis vector, for the (j % num_rhs)th RHS vector. auto hessenberg = this->template create_workspace_op( ws::hessenberg, dim<2>{krylov_dim, (krylov_dim + 1) * num_rhs}); + // Because the auxiliary Hessenberg workspace only ever stores one + // iteration of data at a time, we store it in the "logical" layout + // from the start. 
LocalVector* hessenberg_aux = nullptr; if (this->parameters_.orthog_method == gmres::orthog_method::cgs2) { hessenberg_aux = this->template create_workspace_op( - ws::hessenberg_aux, dim<2>{1, (krylov_dim + 1) * num_rhs}); + ws::hessenberg_aux, dim<2>{(krylov_dim + 1), num_rhs}); } auto givens_sin = this->template create_workspace_op( ws::givens_sin, dim<2>{krylov_dim, num_rhs}); @@ -506,12 +508,16 @@ void Gmres::apply_dense_impl(const VectorType* dense_b, this->get_preconditioner()->apply(this_krylov, preconditioned_krylov_vector); - // Create view of current "column" in the hessenberg matrix: + // Create view of current column in the hessenberg matrix: // hessenberg_iter = hessenberg(:, restart_iter), which - // is actually stored as a row, hessenberg(restart_iter, :) - auto hessenberg_iter = - hessenberg->create_submatrix(span{restart_iter, restart_iter + 1}, - span{0, num_rhs * (restart_iter + 2)}); + // is actually stored as a row, hessenberg(restart_iter, :), + // but we will reshape it for viewing in hessenberg_iter. 
+ auto hessenberg_iter = LocalVector::create( + exec, dim<2>{restart_iter + 2, num_rhs}, + make_array_view(exec, (restart_iter + 2) * num_rhs, + hessenberg->get_values() + + restart_iter * hessenberg->get_size()[1]), + num_rhs); // Start of Arnoldi // next_krylov = A * preconditioned_krylov_vector @@ -537,8 +543,7 @@ void Gmres::apply_dense_impl(const VectorType* dense_b, // (stored in hessenberg(restart_iter, (restart_iter + 1) * num_rhs)) // next_krylov /= hessenberg(restart_iter+1, restart_iter) auto hessenberg_norm_entry = hessenberg_iter->create_submatrix( - span{0, 1}, - span{(restart_iter + 1) * num_rhs, (restart_iter + 2) * num_rhs}); + span{restart_iter + 1, restart_iter + 2}, span{0, num_rhs}); help_compute_norm::compute_next_krylov_norm_into_hessenberg( next_krylov.get(), hessenberg_norm_entry.get(), next_krylov_norm_tmp, reduction_tmp); diff --git a/reference/solver/common_gmres_kernels.cpp b/reference/solver/common_gmres_kernels.cpp index 122c224d5c1..4ba091e03ae 100644 --- a/reference/solver/common_gmres_kernels.cpp +++ b/reference/solver/common_gmres_kernels.cpp @@ -30,15 +30,14 @@ template void calculate_sin_and_cos(matrix::Dense* givens_sin, matrix::Dense* givens_cos, matrix::Dense* hessenberg_iter, - size_type iter, const size_type num_rhs, - const size_type rhs) + size_type iter, const size_type rhs) { - if (is_zero(hessenberg_iter->at(0, iter * num_rhs + rhs))) { + if (is_zero(hessenberg_iter->at(iter, rhs))) { givens_cos->at(iter, rhs) = zero(); givens_sin->at(iter, rhs) = one(); } else { - auto this_hess = hessenberg_iter->at(0, iter * num_rhs + rhs); - auto next_hess = hessenberg_iter->at(0, (iter + 1) * num_rhs + rhs); + auto this_hess = hessenberg_iter->at(iter, rhs); + auto next_hess = hessenberg_iter->at(iter + 1, rhs); const auto scale = abs(this_hess) + abs(next_hess); const auto hypotenuse = scale * sqrt(abs(this_hess / scale) * abs(this_hess / scale) + @@ -53,24 +52,19 @@ template void givens_rotation(matrix::Dense* givens_sin, 
matrix::Dense* givens_cos, matrix::Dense* hessenberg_iter, size_type iter, - const size_type num_rhs, const stopping_status* stop_status) { - for (size_type i = 0; i < num_rhs; ++i) { + for (size_type i = 0; i < hessenberg_iter->get_size()[1]; ++i) { if (stop_status[i].has_stopped()) { continue; } for (size_type j = 0; j < iter; ++j) { - auto temp = - givens_cos->at(j, i) * hessenberg_iter->at(0, j * num_rhs + i) + - givens_sin->at(j, i) * - hessenberg_iter->at(0, (j + 1) * num_rhs + i); - hessenberg_iter->at(0, (j + 1) * num_rhs + i) = - -conj(givens_sin->at(j, i)) * - hessenberg_iter->at(0, j * num_rhs + i) + - conj(givens_cos->at(j, i)) * - hessenberg_iter->at(0, (j + 1) * num_rhs + i); - hessenberg_iter->at(0, j * num_rhs + i) = temp; + auto temp = givens_cos->at(j, i) * hessenberg_iter->at(j, i) + + givens_sin->at(j, i) * hessenberg_iter->at(j + 1, i); + hessenberg_iter->at(j + 1, i) = + -conj(givens_sin->at(j, i)) * hessenberg_iter->at(j, i) + + conj(givens_cos->at(j, i)) * hessenberg_iter->at(j + 1, i); + hessenberg_iter->at(j, i) = temp; // temp = cos(j)*hessenberg(j) + // sin(j)*hessenberg(j+1) // hessenberg(j+1) = -conj(sin(j))*hessenberg(j) + @@ -78,15 +72,12 @@ void givens_rotation(matrix::Dense* givens_sin, // hessenberg(j) = temp; } - calculate_sin_and_cos(givens_sin, givens_cos, hessenberg_iter, iter, - num_rhs, i); + calculate_sin_and_cos(givens_sin, givens_cos, hessenberg_iter, iter, i); - hessenberg_iter->at(0, iter * num_rhs + i) = - givens_cos->at(iter, i) * - hessenberg_iter->at(0, iter * num_rhs + i) + - givens_sin->at(iter, i) * - hessenberg_iter->at(0, (iter + 1) * num_rhs + i); - hessenberg_iter->at(0, (iter + 1) * num_rhs + i) = zero(); + hessenberg_iter->at(iter, i) = + givens_cos->at(iter, i) * hessenberg_iter->at(iter, i) + + givens_sin->at(iter, i) * hessenberg_iter->at(iter + 1, i); + hessenberg_iter->at(iter + 1, i) = zero(); // hessenberg(iter) = cos(iter)*hessenberg(iter) + // sin(iter)*hessenberg(iter + 1) // hessenberg(iter+1) = 
0 @@ -160,8 +151,7 @@ void hessenberg_qr(std::shared_ptr exec, } } - givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter, - residual_norm->get_size()[1], stop_status); + givens_rotation(givens_sin, givens_cos, hessenberg_iter, iter, stop_status); calculate_next_residual_norm(givens_sin, givens_cos, residual_norm, residual_norm_collection, iter, stop_status); } diff --git a/reference/solver/gmres_kernels.cpp b/reference/solver/gmres_kernels.cpp index 4c482632353..a7f5a751a3b 100644 --- a/reference/solver/gmres_kernels.cpp +++ b/reference/solver/gmres_kernels.cpp @@ -79,14 +79,14 @@ void multi_dot(std::shared_ptr exec, { auto num_rhs = next_krylov->get_size()[1]; auto krylov_bases_rowoffset = next_krylov->get_size()[0]; - for (size_type i = 0; i < hessenberg_col->get_size()[1]; ++i) { - auto ivec = i / num_rhs; - auto irhs = i % num_rhs; - hessenberg_col->at(0, i) = zero(); - for (size_type j = 0; j < krylov_bases_rowoffset; ++j) { - hessenberg_col->at(0, i) += - krylov_bases->at(ivec * krylov_bases_rowoffset + j, irhs) * - next_krylov->at(j, irhs); + for (size_type i = 0; i < hessenberg_col->get_size()[0] - 1; ++i) { + for (size_type k = 0; k < num_rhs; ++k) { + hessenberg_col->at(i, k) = zero(); + for (size_type j = 0; j < krylov_bases_rowoffset; ++j) { + hessenberg_col->at(i, k) += + conj(krylov_bases->at(i * krylov_bases_rowoffset + j, k)) * + next_krylov->at(j, k); + } } } } diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index bc877e0ed76..7bbb30fff11 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -227,12 +227,19 @@ TYPED_TEST(Gmres, KernelHessenbergQrIter0) this->small_final_iter_nums.get_data()[0] = 0; this->small_final_iter_nums.get_data()[1] = 0; + // Reshape into "hessenberg_iter" columns as done in Gmres + auto hessenberg_iter_rows = this->small_givens_sin->get_size()[0] + 1; + auto hessenberg_iter_cols = this->small_givens_sin->get_size()[1]; + 
auto hessenberg_reshape = Mtx::create( + this->exec, gko::dim<2>{hessenberg_iter_rows, hessenberg_iter_cols}, + make_array_view(this->exec, hessenberg_iter_rows * hessenberg_iter_cols, + this->small_hessenberg->get_values()), + hessenberg_iter_cols); gko::kernels::reference::common_gmres::hessenberg_qr( this->exec, this->small_givens_sin.get(), this->small_givens_cos.get(), this->small_residual_norm.get(), - this->small_residual_norm_collection.get(), - this->small_hessenberg.get(), iteration, - this->small_final_iter_nums.get_data(), + this->small_residual_norm_collection.get(), hessenberg_reshape.get(), + iteration, this->small_final_iter_nums.get_data(), this->small_stop.get_const_data()); ASSERT_EQ(this->small_final_iter_nums.get_data()[0], 1); @@ -272,12 +279,19 @@ TYPED_TEST(Gmres, KernelHessenbergQrIter1) this->small_final_iter_nums.get_data()[0] = 1; this->small_final_iter_nums.get_data()[1] = 1; + // Reshape into "hessenberg_iter" columns as done in Gmres + auto hessenberg_iter_rows = this->small_givens_sin->get_size()[0] + 1; + auto hessenberg_iter_cols = this->small_givens_sin->get_size()[1]; + auto hessenberg_reshape = Mtx::create( + this->exec, gko::dim<2>{hessenberg_iter_rows, hessenberg_iter_cols}, + make_array_view(this->exec, hessenberg_iter_rows * hessenberg_iter_cols, + this->small_hessenberg->get_values()), + hessenberg_iter_cols); gko::kernels::reference::common_gmres::hessenberg_qr( this->exec, this->small_givens_sin.get(), this->small_givens_cos.get(), this->small_residual_norm.get(), - this->small_residual_norm_collection.get(), - this->small_hessenberg.get(), iteration, - this->small_final_iter_nums.get_data(), + this->small_residual_norm_collection.get(), hessenberg_reshape.get(), + iteration, this->small_final_iter_nums.get_data(), this->small_stop.get_const_data()); ASSERT_EQ(this->small_final_iter_nums.get_data()[0], 2); @@ -372,9 +386,13 @@ TYPED_TEST(Gmres, KernelMultiDot) const T nan = std::numeric_limits>::quiet_NaN(); const auto 
restart = this->small_givens_sin->get_size()[0]; this->small_hessenberg->fill(gko::zero()); - auto hessenberg_iter = this->small_hessenberg->create_submatrix( - gko::span{0, 1}, - gko::span{0, (restart + 1) * this->small_x->get_size()[1]}); + // Reshape into "hessenberg_iter" columns as done in Gmres + auto hessenberg_iter = Mtx::create( + this->exec, gko::dim<2>{restart + 1, this->small_x->get_size()[1]}, + make_array_view(this->exec, + (restart + 1) * this->small_x->get_size()[1], + this->small_hessenberg->get_values()), + this->small_x->get_size()[1]); this->small_x = gko::initialize( // next_krylov {I{-1.0, 2.3}, I{-14.0, -22.0}, I{8.4, 14.2}}, this->exec); @@ -396,7 +414,7 @@ TYPED_TEST(Gmres, KernelMultiDot) hessenberg_iter.get()); GKO_ASSERT_MTX_NEAR(hessenberg_iter, - l({{-3.8, -48.6, -23.6, -65.1, -43.4, -81.6}}), + l({{-3.8, -48.6}, {-23.6, -65.1}, {0.0, 0.0}}), r::value); } diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp index fb2eab5c040..d4dcbf19318 100644 --- a/test/solver/gmres_kernels.cpp +++ b/test/solver/gmres_kernels.cpp @@ -78,7 +78,7 @@ class Gmres : public CommonTestFixture { gen_mtx(gko::solver::gmres_default_krylov_dim, (gko::solver::gmres_default_krylov_dim + 1) * nrhs); hessenberg_iter = - gen_mtx(1, (gko::solver::gmres_default_krylov_dim + 1) * nrhs); + gen_mtx(gko::solver::gmres_default_krylov_dim + 1, nrhs); residual = gen_mtx(m, nrhs); residual_norm = gen_mtx(1, nrhs); residual_norm_collection = @@ -308,11 +308,11 @@ TEST_F(Gmres, GmresKernelMultiDotIsEquivalentToRef) // solver's restart would be triggered, so it is only the final row of // the Hessenberg column(s) that we ignore. 
auto hessenberg_iter_small = hessenberg_iter->create_submatrix( - gko::span{0, 1}, - gko::span{0, gko::solver::gmres_default_krylov_dim * x->get_size()[1]}); + gko::span{0, gko::solver::gmres_default_krylov_dim + 1}, + gko::span{0, x->get_size()[1]}); auto d_hessenberg_iter_small = d_hessenberg_iter->create_submatrix( - gko::span{0, 1}, - gko::span{0, gko::solver::gmres_default_krylov_dim * x->get_size()[1]}); + gko::span{0, gko::solver::gmres_default_krylov_dim + 1}, + gko::span{0, x->get_size()[1]}); GKO_ASSERT_MTX_NEAR(d_hessenberg_iter_small, hessenberg_iter_small, r::value); } From 1344522314e8823f83c8995da7a6f53b18d9ffd1 Mon Sep 17 00:00:00 2001 From: nbeams <246972+nbeams@users.noreply.github.com> Date: Mon, 22 Jul 2024 23:24:49 +0000 Subject: [PATCH 117/448] gmres: minor spacing and namespace fixes --- core/solver/gmres.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index d47eb4428ea..dbad7d8d1d8 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -52,6 +52,7 @@ std::ostream& operator<<(std::ostream& stream, orthog_method orthog) return stream; } + } // namespace gmres @@ -142,7 +143,7 @@ struct help_compute_norm { } }; -namespace { + // Orthogonalization helper functions template void orthogonalize_mgs(matrix::Dense* hessenberg_iter, @@ -169,6 +170,7 @@ void orthogonalize_mgs(matrix::Dense* hessenberg_iter, } } + template void finish_reduce(matrix::Dense* hessenberg_iter, matrix::Dense* next_krylov, @@ -177,6 +179,7 @@ void finish_reduce(matrix::Dense* hessenberg_iter, return; } + #if GINKGO_BUILD_MPI template void finish_reduce(matrix::Dense* hessenberg_iter, @@ -208,6 +211,7 @@ void finish_reduce(matrix::Dense* hessenberg_iter, } #endif + template void orthogonalize_cgs(matrix::Dense* hessenberg_iter, VectorType* krylov_bases, VectorType* next_krylov, @@ -290,7 +294,7 @@ void orthogonalize_cgs2(matrix::Dense* hessenberg_iter, // Add both Hessenberg columns 
hessenberg_iter->add_scaled(one_op, hessenberg_aux_iter); } -} // anonymous namespace + template struct help_compute_norm template void Gmres::apply_dense_impl(const VectorType* dense_b, From 15995da4ccd746938fd6b889f027ce2cf0aed91e Mon Sep 17 00:00:00 2001 From: nbeams <246972+nbeams@users.noreply.github.com> Date: Mon, 22 Jul 2024 23:27:00 +0000 Subject: [PATCH 118/448] Simplify multi_dot kernel test --- test/solver/gmres_kernels.cpp | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp index d4dcbf19318..a084e17fbdc 100644 --- a/test/solver/gmres_kernels.cpp +++ b/test/solver/gmres_kernels.cpp @@ -301,19 +301,7 @@ TEST_F(Gmres, GmresKernelMultiDotIsEquivalentToRef) exec, d_krylov_basis.get(), d_next_krylov.get(), d_hessenberg_iter.get()); - // The multidot computation does not set the value below the diagonal - // in the Hessenberg matrix column(s), as that is done after the - // orthogonalization of the next basis vector. In this test, we - // are checking the column(s) created on the last iteration before the - // solver's restart would be triggered, so it is only the final row of - // the Hessenberg column(s) that we ignore. 
- auto hessenberg_iter_small = hessenberg_iter->create_submatrix( - gko::span{0, gko::solver::gmres_default_krylov_dim + 1}, - gko::span{0, x->get_size()[1]}); - auto d_hessenberg_iter_small = d_hessenberg_iter->create_submatrix( - gko::span{0, gko::solver::gmres_default_krylov_dim + 1}, - gko::span{0, x->get_size()[1]}); - GKO_ASSERT_MTX_NEAR(d_hessenberg_iter_small, hessenberg_iter_small, + GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, r::value); } From ec25ea3765c52f030413c1e00b287fe0151f5424 Mon Sep 17 00:00:00 2001 From: nbeams <246972+nbeams@users.noreply.github.com> Date: Mon, 5 Aug 2024 19:23:26 +0000 Subject: [PATCH 119/448] Rename orthog_method to ortho_method --- core/solver/gmres.cpp | 35 ++++++++++++------------- core/test/config/solver.cpp | 6 ++--- include/ginkgo/core/solver/gmres.hpp | 8 +++--- reference/test/solver/gmres_kernels.cpp | 10 +++---- test/mpi/solver/solver.cpp | 12 ++++----- test/solver/gmres_kernels.cpp | 12 ++++----- 6 files changed, 41 insertions(+), 42 deletions(-) diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index dbad7d8d1d8..e47714b2186 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -39,14 +39,14 @@ GKO_REGISTER_OPERATION(multi_dot, gmres::multi_dot); } // anonymous namespace -std::ostream& operator<<(std::ostream& stream, orthog_method orthog) +std::ostream& operator<<(std::ostream& stream, ortho_method ortho) { - switch (orthog) { - case orthog_method::mgs: + switch (ortho) { + case ortho_method::mgs: return stream << "mgs"; - case orthog_method::cgs: + case ortho_method::cgs: return stream << "cgs"; - case orthog_method::cgs2: + case ortho_method::cgs2: return stream << "cgs2"; } return stream; @@ -69,19 +69,19 @@ typename Gmres::parameters_type Gmres::parse( if (auto& obj = config.get("flexible")) { params.with_flexible(gko::config::get_value(obj)); } - if (auto& obj = config.get("orthog_method")) { + if (auto& obj = config.get("ortho_method")) { auto str = obj.get_string(); - 
gmres::orthog_method orthog; + gmres::ortho_method ortho; if (str == "mgs") { - orthog = gmres::orthog_method::mgs; + ortho = gmres::ortho_method::mgs; } else if (str == "cgs") { - orthog = gmres::orthog_method::cgs; + ortho = gmres::ortho_method::cgs; } else if (str == "cgs2") { - orthog = gmres::orthog_method::cgs2; + ortho = gmres::ortho_method::cgs2; } else { - GKO_INVALID_CONFIG_VALUE("orthog_method", str); + GKO_INVALID_CONFIG_VALUE("ortho_method", str); } - params.with_orthog_method(orthog); + params.with_ortho_method(ortho); } return params; } @@ -361,7 +361,7 @@ void Gmres::apply_dense_impl(const VectorType* dense_b, // iteration of data at a time, we store it in the "logical" layout // from the start. LocalVector* hessenberg_aux = nullptr; - if (this->parameters_.orthog_method == gmres::orthog_method::cgs2) { + if (this->parameters_.ortho_method == gmres::ortho_method::cgs2) { hessenberg_aux = this->template create_workspace_op( ws::hessenberg_aux, dim<2>{(krylov_dim + 1), num_rhs}); } @@ -528,17 +528,16 @@ void Gmres::apply_dense_impl(const VectorType* dense_b, // next_krylov = A * preconditioned_krylov_vector this->get_system_matrix()->apply(preconditioned_krylov_vector, next_krylov); - if (this->parameters_.orthog_method == gmres::orthog_method::mgs) { + if (this->parameters_.ortho_method == gmres::ortho_method::mgs) { orthogonalize_mgs(hessenberg_iter.get(), krylov_bases, next_krylov.get(), reduction_tmp, restart_iter, num_rows, num_rhs, local_num_rows); - } else if (this->parameters_.orthog_method == - gmres::orthog_method::cgs) { + } else if (this->parameters_.ortho_method == gmres::ortho_method::cgs) { orthogonalize_cgs(hessenberg_iter.get(), krylov_bases, next_krylov.get(), restart_iter, num_rows, num_rhs, local_num_rows); - } else if (this->parameters_.orthog_method == - gmres::orthog_method::cgs2) { + } else if (this->parameters_.ortho_method == + gmres::ortho_method::cgs2) { orthogonalize_cgs2(hessenberg_iter.get(), krylov_bases, 
next_krylov.get(), hessenberg_aux, one_op, restart_iter, num_rows, num_rhs, local_num_rows); diff --git a/core/test/config/solver.cpp b/core/test/config/solver.cpp index 78f1f7351f8..a170ebb1e04 100644 --- a/core/test/config/solver.cpp +++ b/core/test/config/solver.cpp @@ -289,8 +289,8 @@ struct Gmres param.with_krylov_dim(3u); config_map["flexible"] = pnode{true}; param.with_flexible(true); - config_map["orthog_method"] = pnode{"cgs"}; - param.with_orthog_method(gko::solver::gmres::orthog_method::cgs); + config_map["ortho_method"] = pnode{"cgs"}; + param.with_ortho_method(gko::solver::gmres::ortho_method::cgs); } template @@ -302,7 +302,7 @@ struct Gmres solver_config_test::template validate(result, answer); ASSERT_EQ(res_param.krylov_dim, ans_param.krylov_dim); ASSERT_EQ(res_param.flexible, ans_param.flexible); - ASSERT_EQ(res_param.orthog_method, ans_param.orthog_method); + ASSERT_EQ(res_param.ortho_method, ans_param.ortho_method); } }; diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp index 308dadf5218..3ba3acf94bb 100644 --- a/include/ginkgo/core/solver/gmres.hpp +++ b/include/ginkgo/core/solver/gmres.hpp @@ -35,7 +35,7 @@ namespace gmres { /** * Set the orthogonalization method for the Krylov subspace. */ -enum class orthog_method { +enum class ortho_method { /** * Modified Gram-Schmidt (default) */ @@ -51,7 +51,7 @@ enum class orthog_method { }; /** Prints an orthogonalization method. 
*/ -std::ostream& operator<<(std::ostream& stream, orthog_method orthog); +std::ostream& operator<<(std::ostream& stream, ortho_method ortho); } // namespace gmres @@ -118,8 +118,8 @@ class Gmres bool GKO_FACTORY_PARAMETER_SCALAR(flexible, false); /** Orthogonalization method */ - gmres::orthog_method GKO_FACTORY_PARAMETER_SCALAR( - orthog_method, gmres::orthog_method::mgs); + gmres::ortho_method GKO_FACTORY_PARAMETER_SCALAR( + ortho_method, gmres::ortho_method::mgs); }; GKO_ENABLE_LIN_OP_FACTORY(Gmres, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 7bbb30fff11..3f11b087bb7 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -754,17 +754,17 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1WithRestart) TYPED_TEST(Gmres, SolvesWithPreconditioner) { - using gko::solver::gmres::orthog_method; + using gko::solver::gmres::ortho_method; using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; - for (auto orthog : - {orthog_method::mgs, orthog_method::cgs, orthog_method::cgs2}) { - SCOPED_TRACE(orthog); + for (auto ortho : + {ortho_method::mgs, ortho_method::cgs, ortho_method::cgs2}) { + SCOPED_TRACE(ortho); auto gmres_factory_preconditioner = Solver::build() - .with_orthog_method(orthog) + .with_ortho_method(ortho) .with_criteria( gko::stop::Iteration::build().with_max_iters(100u), gko::stop::ResidualNorm::build() diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp index aaf61cb47ea..be9f6865c86 100644 --- a/test/mpi/solver/solver.cpp +++ b/test/mpi/solver/solver.cpp @@ -195,14 +195,14 @@ struct Ir : SimpleSolverTest> { }; -template +template struct Gmres : SimpleSolverTest> { static typename solver_type::parameters_type build( std::shared_ptr exec) { return SimpleSolverTest>::build( std::move(exec)) - 
.with_orthog_method(orthog) + .with_ortho_method(ortho) .with_krylov_dim(dimension); } }; @@ -532,10 +532,10 @@ class Solver : public CommonMpiTestFixture { using SolverTypes = ::testing::Types, Gcr<100u>, - Gmres<10u, gko::solver::gmres::orthog_method::mgs>, - Gmres<10u, gko::solver::gmres::orthog_method::cgs>, - Gmres<10u, gko::solver::gmres::orthog_method::cgs2>, - Gmres<100u, gko::solver::gmres::orthog_method::mgs>>; + Gmres<10u, gko::solver::gmres::ortho_method::mgs>, + Gmres<10u, gko::solver::gmres::ortho_method::cgs>, + Gmres<10u, gko::solver::gmres::ortho_method::cgs2>, + Gmres<100u, gko::solver::gmres::ortho_method::mgs>>; TYPED_TEST_SUITE(Solver, SolverTypes, TypenameNameGenerator); diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp index a084e17fbdc..72cbc83b002 100644 --- a/test/solver/gmres_kernels.cpp +++ b/test/solver/gmres_kernels.cpp @@ -327,18 +327,18 @@ TEST_F(Gmres, GmresApplyOneRHSIsEquivalentToRef) TEST_F(Gmres, GmresApplyMultipleRHSIsEquivalentToRef) { - using gko::solver::gmres::orthog_method; + using gko::solver::gmres::ortho_method; auto base_params = gko::clone(ref, ref_gmres_factory)->get_parameters(); - for (auto orthog : - {orthog_method::mgs, orthog_method::cgs, orthog_method::cgs2}) { - SCOPED_TRACE(orthog); + for (auto ortho : + {ortho_method::mgs, ortho_method::cgs, ortho_method::cgs2}) { + SCOPED_TRACE(ortho); int m = 123; int n = 5; auto ref_solver = - base_params.with_orthog_method(orthog).on(ref)->generate(mtx); + base_params.with_ortho_method(ortho).on(ref)->generate(mtx); auto exec_solver = - base_params.with_orthog_method(orthog).on(exec)->generate(d_mtx); + base_params.with_ortho_method(ortho).on(exec)->generate(d_mtx); auto b = gen_mtx(m, n); auto x = gen_mtx(m, n); auto d_b = gko::clone(exec, b); From f47f94ee3ab9d80a79e63f085fb422b68c4d1c6e Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 19 Jun 2024 09:33:05 +0200 Subject: [PATCH 120/448] [core] reading mm-files discards extra characters in 
row --- core/base/mtx_io.cpp | 7 ++++++ core/test/base/mtx_io.cpp | 50 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index c264a073f31..5851135607e 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -35,6 +35,9 @@ namespace { } +constexpr auto max_streamsize = std::numeric_limits::max(); + + /** * The mtx_io class provides the functionality of reading and writing matrix * market format files. @@ -514,6 +517,8 @@ class mtx_io { GKO_CHECK_STREAM(content, "error when reading matrix entry " + std::to_string(i)); modifier->insert_entry(row - 1, col - 1, entry, data); + content.ignore(max_streamsize, + '\n'); // discards rest of the line } return data; } @@ -582,6 +587,8 @@ class mtx_io { std::to_string(row) + " ," + std::to_string(col)); modifier->insert_entry(row, col, entry, data); + content.ignore(max_streamsize, + '\n'); // discards rest of the line } } return data; diff --git a/core/test/base/mtx_io.cpp b/core/test/base/mtx_io.cpp index 66b6766b2d3..8ac1ced0e50 100644 --- a/core/test/base/mtx_io.cpp +++ b/core/test/base/mtx_io.cpp @@ -231,6 +231,32 @@ TEST(MtxReader, ReadsDenseComplexFloatMtxWith64Index) } +TEST(MtxReader, ReadsDenseIgnoresExtraCharactersInRow) +{ + using tpl = gko::matrix_data::nonzero_type; + std::istringstream iss( + "%%MatrixMarket matrix array real general\n" + "2 3 -77\n" + "1.0\n" + "0.0 58\n" + "3.0\n" + "5.0\n" + "2.0\n" + "0.0\n"); + + auto data = gko::read_raw(iss); + + ASSERT_EQ(data.size, gko::dim<2>(2, 3)); + auto& v = data.nonzeros; + ASSERT_EQ(v[0], tpl(0, 0, 1.0)); + ASSERT_EQ(v[1], tpl(0, 1, 3.0)); + ASSERT_EQ(v[2], tpl(0, 2, 2.0)); + ASSERT_EQ(v[3], tpl(1, 0, 0.0)); + ASSERT_EQ(v[4], tpl(1, 1, 5.0)); + ASSERT_EQ(v[5], tpl(1, 2, 0.0)); +} + + TEST(MtxReader, ReadsSparseRealMtx) { using tpl = gko::matrix_data::nonzero_type; @@ -385,7 +411,29 @@ TEST(MtxReader, ReadsSparseComplexHermitianMtx) } -TEST(MtxReader, 
ReadIgnoresExtraCharacters) +TEST(MtxReader, ReadsSparseIgnoresExtraCharactersInRow) +{ + using tpl = gko::matrix_data::nonzero_type; + std::istringstream iss( + "%%MatrixMarket matrix coordinate real general\n" + "2 3 4 abc\n" + "1 1 1.0 some value\n" + "2 2 5.0 who knows?\n" + "1 2 3.0\n" + "1 3 2.0\n"); + + auto data = gko::read_raw(iss); + + ASSERT_EQ(data.size, gko::dim<2>(2, 3)); + auto& v = data.nonzeros; + ASSERT_EQ(v[0], tpl(0, 0, 1.0)); + ASSERT_EQ(v[1], tpl(0, 1, 3.0)); + ASSERT_EQ(v[2], tpl(0, 2, 2.0)); + ASSERT_EQ(v[3], tpl(1, 1, 5.0)); +} + + +TEST(MtxReader, ReadHeaderIgnoresExtraCharacters) { using tpl = gko::matrix_data::nonzero_type; std::istringstream iss( From 8d84ccfbeee0aae15c252a5956171e57ce411ad2 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 13 Aug 2024 11:57:49 +0200 Subject: [PATCH 121/448] review updates: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - formatting - fix error message Co-authored-by: Thomas Grützmacher Co-authored-by: Yu-Hsiang M. 
Tsai --- core/base/mtx_io.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index 5851135607e..33c3b07d487 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -517,8 +517,8 @@ class mtx_io { GKO_CHECK_STREAM(content, "error when reading matrix entry " + std::to_string(i)); modifier->insert_entry(row - 1, col - 1, entry, data); - content.ignore(max_streamsize, - '\n'); // discards rest of the line + // discards rest of the line + content.ignore(max_streamsize, '\n'); } return data; } @@ -574,7 +574,7 @@ class mtx_io { size_type num_cols{}; GKO_CHECK_STREAM( header >> num_rows >> num_cols, - "error when determining matrix size, expected: rows cols nnz"); + "error when determining matrix size, expected: rows cols"); matrix_data data(dim<2>{num_rows, num_cols}); data.nonzeros.reserve(modifier->get_reservation_size( num_rows, num_cols, num_rows * num_cols)); @@ -587,8 +587,8 @@ class mtx_io { std::to_string(row) + " ," + std::to_string(col)); modifier->insert_entry(row, col, entry, data); - content.ignore(max_streamsize, - '\n'); // discards rest of the line + // discards rest of the line + content.ignore(max_streamsize, '\n'); } } return data; From a328701aef8f5344f5c8efaf1caacda5418f721c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 11 Aug 2024 14:50:39 +0200 Subject: [PATCH 122/448] fix uninitialized array alignment --- .../components/uninitialized_array.hpp | 43 ++++++++++++++++--- .../cuda_hip/matrix/csr_kernels.template.cpp | 2 +- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/common/cuda_hip/components/uninitialized_array.hpp b/common/cuda_hip/components/uninitialized_array.hpp index 44fcbfd0d85..8929476fbd6 100644 --- a/common/cuda_hip/components/uninitialized_array.hpp +++ b/common/cuda_hip/components/uninitialized_array.hpp @@ -8,6 +8,8 @@ #include +#include "common/cuda_hip/base/thrust.hpp" + namespace gko { namespace kernels { @@ -34,7 +36,7 @@ 
class uninitialized_array { */ constexpr GKO_ATTRIBUTES operator const ValueType*() const noexcept { - return &(*this)[0]; + return data_; } /** @@ -43,7 +45,7 @@ class uninitialized_array { * * @return the non-const pointer to the first entry of the array. */ - GKO_ATTRIBUTES operator ValueType*() noexcept { return &(*this)[0]; } + GKO_ATTRIBUTES operator ValueType*() noexcept { return data_; } /** * constexpr array access operator. @@ -56,7 +58,7 @@ class uninitialized_array { constexpr GKO_ATTRIBUTES const ValueType& operator[]( size_type pos) const noexcept { - return reinterpret_cast(data_)[pos]; + return data_[pos]; } /** @@ -69,11 +71,42 @@ class uninitialized_array { */ GKO_ATTRIBUTES ValueType& operator[](size_type pos) noexcept { - return reinterpret_cast(data_)[pos]; + return data_[pos]; + } + +private: + ValueType data_[size]; +}; + + +template +class uninitialized_array, size> { +public: + constexpr GKO_ATTRIBUTES operator const thrust::complex*() + const noexcept + { + return &(*this)[0]; + } + + GKO_ATTRIBUTES operator thrust::complex*() noexcept + { + return &(*this)[0]; + } + + constexpr GKO_ATTRIBUTES const thrust::complex& operator[]( + size_type pos) const noexcept + { + return reinterpret_cast*>(data_)[pos]; + } + + GKO_ATTRIBUTES thrust::complex& operator[]( + size_type pos) noexcept + { + return reinterpret_cast*>(data_)[pos]; } private: - unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size]; + ValueType data_[2 * size]; }; diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp index f17cf1548fe..757e689a777 100644 --- a/common/cuda_hip/matrix/csr_kernels.template.cpp +++ b/common/cuda_hip/matrix/csr_kernels.template.cpp @@ -335,7 +335,7 @@ __device__ void merge_path_reduce(const IndexType nwarps, } } } - __shared__ uninitialized_array tmp_ind; + __shared__ IndexType tmp_ind[spmv_block_size]; __shared__ uninitialized_array tmp_val; tmp_val[threadIdx.x] = value; 
tmp_ind[threadIdx.x] = row; From dc49b315d7451f65208104b9b37b4433a5066bca Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 12 Aug 2024 16:38:13 +0200 Subject: [PATCH 123/448] fix NaN handling on Windows MSVC doesn't treat the is_nan properly, so we do a byte comparison instead --- cuda/solver/common_trs_kernels.cuh | 13 +++--- include/ginkgo/core/base/math.hpp | 63 +++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 8 deletions(-) diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 7cedf2fbd2e..31ba6f0c19f 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -399,11 +399,12 @@ __global__ void sptrsv_naive_caching_kernel( ValueType val{}; if (shmem_possible) { const auto dependency_shid = dependency_gid % default_block_size; - while (is_nan(val = load_relaxed_shared(x_s + dependency_shid))) { + while (is_nan_exact( + val = load_relaxed_shared(x_s + dependency_shid))) { } } else { - while ( - is_nan(val = load_relaxed(x + dependency * x_stride + rhs))) { + while (is_nan_exact( + val = load_relaxed(x + dependency * x_stride + rhs))) { } } @@ -418,7 +419,7 @@ __global__ void sptrsv_naive_caching_kernel( store_relaxed(x + row * x_stride + rhs, r); // This check to ensure no infinite loops happen. 
- if (is_nan(r)) { + if (is_nan_exact(r)) { store_relaxed_shared(x_s + self_shid, zero()); store_relaxed(x + row * x_stride + rhs, zero()); *nan_produced = true; @@ -460,7 +461,7 @@ __global__ void sptrsv_naive_legacy_kernel( auto col = colidxs[j]; while (j != row_end) { auto x_val = load_relaxed(x + col * x_stride + rhs); - while (!is_nan(x_val)) { + while (!is_nan_exact(x_val)) { sum += vals[j] * x_val; j += row_step; col = colidxs[j]; @@ -478,7 +479,7 @@ __global__ void sptrsv_naive_legacy_kernel( // after we encountered the diagonal, we are done // this also skips entries outside the triangle j = row_end; - if (is_nan(r)) { + if (is_nan_exact(r)) { store_relaxed(x + row * x_stride + rhs, zero()); *nan_produced = true; } diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index f7b3b35c3f6..128712f0974 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,21 @@ using std::sqrt; namespace detail { +/** Returns an unsigned type matching the size of the given float type. */ +template +struct float_to_bytes_impl {}; + +template <> +struct float_to_bytes_impl { + using type = uint64; +}; + +template <> +struct float_to_bytes_impl { + using type = uint32; +}; + + /** * Keep the same data type if it is not complex. */ @@ -1223,7 +1239,8 @@ template GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan(const T& value) { - return std::isnan(value); + using std::isnan; + return isnan(value); } @@ -1240,7 +1257,7 @@ template GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan( const T& value) { - return std::isnan(value.real()) || std::isnan(value.imag()); + return is_nan(value.real()) || is_nan(value.imag()); } @@ -1274,6 +1291,48 @@ nan() } +/** + * Checks if a floating point number is a quiet NaN (gko::nan()). 
+ * + * @tparam T type of the value to check + * + * @param value value to check + * + * @return `true` if the value is bitwise equal to gko::nan(). + */ +template +GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> +is_nan_exact(const T& value) +{ + using type = typename detail::float_to_bytes_impl::type; + type value_bytes{}; + type nan_bytes{}; + auto nan_value = nan(); + using std::memcpy; + memcpy(&value_bytes, &value, sizeof(value)); + memcpy(&nan_bytes, &nan_value, sizeof(value)); + return value_bytes == nan_bytes; +} + + +/** + * Checks if any component of a complex value is a quiet NaN (gko::nan). + * + * @tparam T complex type of the value to check + * + * @param value complex value to check + * + * @return `true` if any component of the complex number fulfills + * is_nan_exact(component). + */ +template +GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> +is_nan_exact(const T& value) +{ + return is_nan_exact(value.real()) || is_nan_exact(value.imag()); +} + + } // namespace gko From ec41ac3c984dd4e41584872c76e5f52ae06f555f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 12 Aug 2024 16:53:40 +0200 Subject: [PATCH 124/448] deprecate is_nan --- include/ginkgo/core/base/math.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 128712f0974..034ca6cbd52 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -1236,8 +1236,9 @@ GKO_INLINE GKO_ATTRIBUTES T safe_divide(T a, T b) * @return `true` if the value is NaN. */ template -GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> -is_nan(const T& value) +GKO_DEPRECATED("is_nan can't be used safely on the device (MSVC+CUDA)") +GKO_INLINE GKO_ATTRIBUTES + std::enable_if_t::value, bool> is_nan(const T& value) { using std::isnan; return isnan(value); @@ -1254,6 +1255,7 @@ is_nan(const T& value) * @return `true` if any component of the given value is NaN. 
*/ template +GKO_DEPRECATED("is_nan can't be used safely on the device (MSVC+CUDA)") GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan( const T& value) { From a2c0cef2fda4c3a2b0b833fcc85e7c91509be401 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 12 Aug 2024 16:53:52 +0200 Subject: [PATCH 125/448] run tests for Windows CUDA --- .gitlab-ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2f8e3a892a5..226a10f4cea 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -486,8 +486,7 @@ build/windows-cuda/release/shared: - mkdir install - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=ON "-DCMAKE_INSTALL_PREFIX=$pwd\install" . - cmake --build build --config Release -j16 -# we disable these tests until the triangular solver issues are resolved -# - ctest --test-dir build -C Release --no-tests=error --output-on-failure + - ctest --test-dir build -C Release --no-tests=error --output-on-failure - $env:PATH+=";$pwd/install/bin" - cmake --install build --config Release - cmake --build build --target test_install --config Release From f2f8617f66d3e3363564a3172c137fcaef15cac8 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 12 Aug 2024 18:05:38 +0200 Subject: [PATCH 126/448] review updates and fixes --- include/ginkgo/core/base/math.hpp | 8 ++++++-- test/solver/gcr_kernels.cpp | 2 +- test/solver/lower_trs_kernels.cpp | 2 +- test/solver/upper_trs_kernels.cpp | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 034ca6cbd52..f32f47eda2f 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -1236,7 +1236,9 @@ GKO_INLINE GKO_ATTRIBUTES T safe_divide(T a, T b) * @return `true` if the value is NaN. 
*/ template -GKO_DEPRECATED("is_nan can't be used safely on the device (MSVC+CUDA)") +GKO_DEPRECATED( + "is_nan can't be used safely on the device (MSVC+CUDA), and will thus be " + "removed in a future release, without replacement") GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan(const T& value) { @@ -1255,7 +1257,9 @@ GKO_INLINE GKO_ATTRIBUTES * @return `true` if any component of the given value is NaN. */ template -GKO_DEPRECATED("is_nan can't be used safely on the device (MSVC+CUDA)") +GKO_DEPRECATED( + "is_nan can't be used safely on the device (MSVC+CUDA), and will thus be " + "removed in a future release, without replacement") GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan( const T& value) { diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp index eb3f5c6df93..3095475538d 100644 --- a/test/solver/gcr_kernels.cpp +++ b/test/solver/gcr_kernels.cpp @@ -222,7 +222,7 @@ TEST_F(Gcr, GcrApplyOneRHSIsEquivalentToRef) exec_solver->apply(d_b.get(), d_x.get()); GKO_ASSERT_MTX_NEAR(d_b, b, 0); - GKO_ASSERT_MTX_NEAR(d_x, x, r::value * 1e2); + GKO_ASSERT_MTX_NEAR(d_x, x, r::value * 1e3); } diff --git a/test/solver/lower_trs_kernels.cpp b/test/solver/lower_trs_kernels.cpp index b838c1df14b..9bfea0a22d0 100644 --- a/test/solver/lower_trs_kernels.cpp +++ b/test/solver/lower_trs_kernels.cpp @@ -152,7 +152,7 @@ TEST_F(LowerTrs, ApplyTriangularDenseMtxIsEquivalentToRef) solver->apply(b, x); d_solver->apply(db, dx); - GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx, x, 1e-13); } diff --git a/test/solver/upper_trs_kernels.cpp b/test/solver/upper_trs_kernels.cpp index 6825d9f6c3b..c62dfa7c5de 100644 --- a/test/solver/upper_trs_kernels.cpp +++ b/test/solver/upper_trs_kernels.cpp @@ -152,7 +152,7 @@ TEST_F(UpperTrs, ApplyTriangularDenseMtxIsEquivalentToRef) solver->apply(b, x); d_solver->apply(db, dx); - GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx, x, 1e-13); } From 
31a8e692e22c0deccfcba6cf615336bd4a242690 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 12 Aug 2024 22:17:37 +0200 Subject: [PATCH 127/448] review updates - move is_nan_exact to internal code - rename to float_to_uint_impl - increase tolerance for additional test --- cuda/solver/common_trs_kernels.cuh | 41 +++++++++++++++++++++ include/ginkgo/core/base/math.hpp | 58 ------------------------------ test/solver/lower_trs_kernels.cpp | 2 +- 3 files changed, 42 insertions(+), 59 deletions(-) diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 31ba6f0c19f..7a3712c0390 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -6,6 +6,7 @@ #define GKO_CUDA_SOLVER_COMMON_TRS_KERNELS_CUH_ +#include #include #include #include @@ -342,6 +343,46 @@ constexpr int default_block_size = 512; constexpr int fallback_block_size = 32; +/** Returns an unsigned type matching the size of the given float type. */ +template +struct float_to_unsigned_impl {}; + +template <> +struct float_to_unsigned_impl { + using type = uint64; +}; + +template <> +struct float_to_unsigned_impl { + using type = uint32; +}; + + +/** Checks if a floating point number is a quiet NaN (gko::nan()). */ +template +GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> +is_nan_exact(const T& value) +{ + using type = typename float_to_unsigned_impl::type; + type value_bytes{}; + type nan_bytes{}; + auto nan_value = nan(); + using std::memcpy; + memcpy(&value_bytes, &value, sizeof(value)); + memcpy(&nan_bytes, &nan_value, sizeof(value)); + return value_bytes == nan_bytes; +} + + +/** Checks if any component of a complex value is a quiet NaN (gko::nan). 
*/ +template +GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> +is_nan_exact(const T& value) +{ + return is_nan_exact(value.real()) || is_nan_exact(value.imag()); +} + + template __global__ void sptrsv_naive_caching_kernel( const IndexType* const rowptrs, const IndexType* const colidxs, diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index f32f47eda2f..f6847743717 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -103,21 +102,6 @@ using std::sqrt; namespace detail { -/** Returns an unsigned type matching the size of the given float type. */ -template -struct float_to_bytes_impl {}; - -template <> -struct float_to_bytes_impl { - using type = uint64; -}; - -template <> -struct float_to_bytes_impl { - using type = uint32; -}; - - /** * Keep the same data type if it is not complex. */ @@ -1297,48 +1281,6 @@ nan() } -/** - * Checks if a floating point number is a quiet NaN (gko::nan()). - * - * @tparam T type of the value to check - * - * @param value value to check - * - * @return `true` if the value is bitwise equal to gko::nan(). - */ -template -GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> -is_nan_exact(const T& value) -{ - using type = typename detail::float_to_bytes_impl::type; - type value_bytes{}; - type nan_bytes{}; - auto nan_value = nan(); - using std::memcpy; - memcpy(&value_bytes, &value, sizeof(value)); - memcpy(&nan_bytes, &nan_value, sizeof(value)); - return value_bytes == nan_bytes; -} - - -/** - * Checks if any component of a complex value is a quiet NaN (gko::nan). - * - * @tparam T complex type of the value to check - * - * @param value complex value to check - * - * @return `true` if any component of the complex number fulfills - * is_nan_exact(component). 
- */ -template -GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> -is_nan_exact(const T& value) -{ - return is_nan_exact(value.real()) || is_nan_exact(value.imag()); -} - - } // namespace gko diff --git a/test/solver/lower_trs_kernels.cpp b/test/solver/lower_trs_kernels.cpp index 9bfea0a22d0..da55f6153cc 100644 --- a/test/solver/lower_trs_kernels.cpp +++ b/test/solver/lower_trs_kernels.cpp @@ -417,7 +417,7 @@ TEST_F(LowerTrs, ClassicalApplyTriangularDenseMtxIsEquivalentToRef) solver->apply(b, x); d_solver->apply(db, dx); - GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx, x, 1e-13); } From a569123968f5bcf08e5de72fae5a2285eedb6106 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 13 Aug 2024 12:54:46 +0200 Subject: [PATCH 128/448] fix library location for sparselib benchmark linops For things to work in Windows, the shared libraries need to be in the working directory or PATH --- benchmark/CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index de6e74d464c..e2479e02344 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -22,6 +22,7 @@ function(ginkgo_benchmark_cusparse_linops type def) target_compile_definitions(cusparse_linops_${type} PUBLIC ${def}) target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA) target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse) + ginkgo_compile_features(cusparse_linops_${type}) endfunction() function(ginkgo_benchmark_hipsparse_linops type def) @@ -31,6 +32,7 @@ function(ginkgo_benchmark_hipsparse_linops type def) target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP) target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS}) target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES}) + ginkgo_compile_features(hipsparse_linops_${type}) 
endfunction() function(ginkgo_benchmark_onemkl_linops type def) @@ -38,6 +40,7 @@ function(ginkgo_benchmark_onemkl_linops type def) # make the dependency public to catch issues target_compile_definitions(onemkl_linops_${type} PUBLIC ${def}) target_link_libraries(onemkl_linops_${type} PRIVATE Ginkgo::ginkgo MKL::MKL_DPCPP) + ginkgo_compile_features(onemkl_linops_${type}) endfunction() @@ -116,6 +119,7 @@ if (GINKGO_BUILD_CUDA) ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) add_library(cuda_timer utils/cuda_timer.cpp) target_link_libraries(cuda_timer ginkgo CUDA::cudart) + ginkgo_compile_features(cuda_timer) endif() if (GINKGO_BUILD_HIP) ginkgo_benchmark_hipsparse_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION) @@ -125,6 +129,7 @@ if (GINKGO_BUILD_HIP) set_source_files_properties(utils/hip_timer.hip.cpp PROPERTIES LANGUAGE HIP) add_library(hip_timer utils/hip_timer.hip.cpp) target_link_libraries(hip_timer ginkgo) + ginkgo_compile_features(hip_timer) endif() if (GINKGO_BUILD_SYCL) @@ -136,11 +141,13 @@ if (GINKGO_BUILD_SYCL) target_compile_options(dpcpp_timer PRIVATE ${GINKGO_DPCPP_FLAGS}) gko_add_sycl_to_target(TARGET dpcpp_timer SOURCES utils/dpcpp_timer.dp.cpp) target_link_libraries(dpcpp_timer ginkgo) + ginkgo_compile_features(dpcpp_timer) endif() if (GINKGO_BUILD_MPI) add_library(mpi_timer ${Ginkgo_SOURCE_DIR}/benchmark/utils/mpi_timer.cpp) target_link_libraries(mpi_timer ginkgo) + ginkgo_compile_features(mpi_timer) endif() add_subdirectory(blas) From b9d5224bbdbb54486748dbd3e9dd7ae3289a0601 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 13 Aug 2024 12:55:00 +0200 Subject: [PATCH 129/448] catch invalid JSON in test framework --- benchmark/test/test_framework.py.in | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in index 48f3ca608b2..9294b2f02ec 100644 --- a/benchmark/test/test_framework.py.in +++ 
b/benchmark/test/test_framework.py.in @@ -90,9 +90,12 @@ def sanitize_json_text(input: str) -> List[str]: and pretty-printed to replace the original JSON input. """ - result = json.dumps(sanitize_json(json.loads(input)), indent=4) - # json.dumps doesn't add a trailing newline - return result.splitlines() + [""] + try: + result = json.dumps(sanitize_json(json.loads(input)), indent=4) + # json.dumps doesn't add a trailing newline + return result.splitlines() + [""] + except Exception as e: + return f"Error: {str(e)}" def sanitize_text( From 4e92e81b795bf5a106021ddff87634e9a30e207d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 13 Aug 2024 14:41:24 +0200 Subject: [PATCH 130/448] review updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher --- cuda/solver/common_trs_kernels.cuh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 7a3712c0390..291c842325f 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -358,7 +358,10 @@ struct float_to_unsigned_impl { }; -/** Checks if a floating point number is a quiet NaN (gko::nan()). */ +/** + * Checks if a floating point number representation matches the representation + * of the quiet NaN with value gko::nan() exactly. + */ template GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan_exact(const T& value) @@ -374,7 +377,10 @@ is_nan_exact(const T& value) } -/** Checks if any component of a complex value is a quiet NaN (gko::nan). */ +/** + * Checks if any component of the complex value matches the quiet NaN with + * value gko::nan() exactly. 
+ */ template GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan_exact(const T& value) From 495a1ebba0a01057b366da29d99fa4b26097a0f5 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 11 Jul 2024 11:20:57 +0200 Subject: [PATCH 131/448] Add cmake flag and instantiate only one by default --- CMakeLists.txt | 1 + core/solver/batch_bicgstab_kernels.hpp | 10 +++++++++- core/solver/batch_cg_kernels.hpp | 10 +++++++++- cuda/solver/batch_bicgstab_kernels.cu | 8 ++++++++ cuda/solver/batch_cg_kernels.cu | 8 ++++++++ dpcpp/solver/batch_bicgstab_kernels.dp.cpp | 8 ++++++++ dpcpp/solver/batch_cg_kernels.dp.cpp | 8 ++++++++ hip/solver/batch_bicgstab_kernels.hip.cpp | 8 ++++++++ hip/solver/batch_cg_kernels.hip.cpp | 8 ++++++++ include/ginkgo/config.hpp.in | 4 ++++ 10 files changed, 71 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 21832c98592..f60500b4cc9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,7 @@ option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic option(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS "Split template instantiations for slow-to-compile files. This improves parallel build performance" ON) mark_as_advanced(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS) option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF) +option(GINKGO_BATCHED_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA/HIP batched solver algorithms" OFF) option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON) option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is OFF." OFF) option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Enabled if a system installation is found." 
${PAPI_SDE_FOUND}) diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp index 1eed30aba5a..07ecb1bd834 100644 --- a/core/solver/batch_bicgstab_kernels.hpp +++ b/core/solver/batch_bicgstab_kernels.hpp @@ -6,6 +6,7 @@ #define GKO_CORE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ +#include #include #include #include @@ -15,6 +16,13 @@ #include "core/base/kernel_declaration.hpp" +#ifdef GINKGO_BACTCHED_FULL_OPTIMIZATIONS +constexpr bool bicgstab_no_shared_vecs = false; +#else +constexpr bool bicgstab_no_shared_vecs = true; +#endif + + namespace gko { namespace kernels { namespace batch_bicgstab { @@ -138,7 +146,7 @@ storage_config compute_shared_storage(const int available_shared_mem, // {prec_shared, n_shared, n_global, gmem_stride_bytes, padded_vec_len} storage_config sconf{false, 0, num_main_vecs, 0, num_rows}; // If available shared mem is zero, set all vecs to global. - if (rem_shared <= 0) { + if (rem_shared <= 0 || bicgstab_no_shared_vecs) { set_gmem_stride_bytes(sconf, vec_size, prec_storage); return sconf; } diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp index 6fdb595862e..028223886fe 100644 --- a/core/solver/batch_cg_kernels.hpp +++ b/core/solver/batch_cg_kernels.hpp @@ -6,6 +6,7 @@ #define GKO_CORE_SOLVER_BATCH_CG_KERNELS_HPP_ +#include #include #include #include @@ -15,6 +16,13 @@ #include "core/base/kernel_declaration.hpp" +#ifdef GINKGO_BACTCHED_FULL_OPTIMIZATIONS +constexpr bool cg_no_shared_vecs = false; +#else +constexpr bool cg_no_shared_vecs = true; +#endif + + namespace gko { namespace kernels { namespace batch_cg { @@ -126,7 +134,7 @@ storage_config compute_shared_storage(const int available_shared_mem, // {prec_shared, n_shared, n_global, gmem_stride_bytes, padded_vec_len} storage_config sconf{false, 0, num_main_vecs, 0, num_rows}; // If available shared mem is zero, set all vecs to global. 
- if (rem_shared <= 0) { + if (rem_shared <= 0 || cg_no_shared_vecs) { set_gmem_stride_bytes(sconf, vec_bytes, prec_storage); return sconf; } diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 6b3dca28607..bc12fc7efde 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -167,6 +167,9 @@ public: value_type* const workspace_data = workspace.get_data(); + // Only instantiate when full optimizations has been enabled. Otherwise, + // just use the default one with no shared memory. +#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel if (sconf.prec_shared) { @@ -229,6 +232,11 @@ public: GKO_NOT_IMPLEMENTED; } } +#else + launch_apply_kernel( + sconf, logger, prec, mat, b.values, x.values, workspace_data, + block_size, shared_size); +#endif } private: diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 746be0365e7..f09b6c70487 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -165,6 +165,9 @@ public: value_type* const workspace_data = workspace.get_data(); + // Only instantiate when full optimizations has been enabled. Otherwise, + // just use the default one with no shared memory. 
+#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel if (sconf.prec_shared) { @@ -207,6 +210,11 @@ public: GKO_NOT_IMPLEMENTED; } } +#else + launch_apply_kernel( + sconf, logger, prec, mat, b.values, x.values, workspace_data, + block_size, shared_size); +#endif } private: diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp index 344e4af56b9..3b6d5d1c5df 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp +++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp @@ -159,6 +159,9 @@ class kernel_caller { ValueType* const workspace_data = workspace.get_data(); int n_shared_total = sconf.n_shared + int(sconf.prec_shared); + // Only instantiate when full optimizations has been enabled. Otherwise, + // just use the default one with no shared memory. +#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // template // launch_apply_kernel if (num_rows <= 32 && n_shared_total == 10) { @@ -230,6 +233,11 @@ class kernel_caller { GKO_NOT_IMPLEMENTED; } } +#else + launch_apply_kernel(sconf, logger, prec, mat, b.values, + x.values, workspace_data, + group_size, shared_size); +#endif } private: diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp index 0787afa6fd3..36fbe0dc269 100644 --- a/dpcpp/solver/batch_cg_kernels.dp.cpp +++ b/dpcpp/solver/batch_cg_kernels.dp.cpp @@ -158,6 +158,9 @@ class kernel_caller { ValueType* const workspace_data = workspace.get_data(); int n_shared_total = sconf.n_shared + int(sconf.prec_shared); + // Only instantiate when full optimizations has been enabled. Otherwise, + // just use the default one with no shared memory. 
+#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // template // launch_apply_kernel if (num_rows <= 32 && n_shared_total == 6) { @@ -205,6 +208,11 @@ class kernel_caller { GKO_NOT_IMPLEMENTED; } } +#else + launch_apply_kernel(sconf, logger, prec, mat, b.values, + x.values, workspace_data, + group_size, shared_size); +#endif } private: diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 95a49953b3e..54b63983388 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -149,6 +149,9 @@ class kernel_caller { value_type* const workspace_data = workspace.get_data(); + // Only instantiate when full optimizations has been enabled. Otherwise, + // just use the default one with no shared memory. +#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel( + sconf, logger, prec, mat, b.values, x.values, workspace_data, + block_size, shared_size); +#endif } private: diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 6102749b988..290fd72b9f7 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -149,6 +149,9 @@ class kernel_caller { value_type* const workspace_data = workspace.get_data(); + // Only instantiate when full optimizations has been enabled. Otherwise, + // just use the default one with no shared memory. +#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel( + sconf, logger, prec, mat, b.values, x.values, workspace_data, + block_size, shared_size); +#endif } private: diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 329918399d6..4eb3106633f 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -31,6 +31,10 @@ #cmakedefine GINKGO_JACOBI_FULL_OPTIMIZATIONS +/* Should we use all optimizations for batched solvers? 
*/ +#cmakedefine GINKGO_BATCHED_FULL_OPTIMIZATIONS + + /* Should we compile Ginkgo specifically to tune values? */ #cmakedefine GINKGO_BENCHMARK_ENABLE_TUNING From 1c6bc7be91e0cf3065ff43a1262afa6c2d202e0b Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 23 Jul 2024 14:26:32 +0200 Subject: [PATCH 132/448] [cuda,hip,dpcpp] disable optimized kernels --- CMakeLists.txt | 1 - core/solver/batch_bicgstab_kernels.hpp | 5 +- core/solver/batch_cg_kernels.hpp | 5 +- cuda/solver/batch_bicgstab_kernels.cu | 123 +++++++++--------- cuda/solver/batch_cg_kernels.cu | 81 ++++++------ dpcpp/solver/batch_bicgstab_kernels.dp.cpp | 139 ++++++++++----------- dpcpp/solver/batch_cg_kernels.dp.cpp | 90 +++++++------ hip/solver/batch_bicgstab_kernels.hip.cpp | 120 +++++++++--------- hip/solver/batch_cg_kernels.hip.cpp | 80 ++++++------ include/ginkgo/config.hpp.in | 4 - 10 files changed, 301 insertions(+), 347 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f60500b4cc9..21832c98592 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,7 +55,6 @@ option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic option(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS "Split template instantiations for slow-to-compile files. This improves parallel build performance" ON) mark_as_advanced(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS) option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF) -option(GINKGO_BATCHED_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA/HIP batched solver algorithms" OFF) option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON) option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is OFF." OFF) option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Enabled if a system installation is found." 
${PAPI_SDE_FOUND}) diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp index 07ecb1bd834..5bab0e43b26 100644 --- a/core/solver/batch_bicgstab_kernels.hpp +++ b/core/solver/batch_bicgstab_kernels.hpp @@ -16,11 +16,8 @@ #include "core/base/kernel_declaration.hpp" -#ifdef GINKGO_BACTCHED_FULL_OPTIMIZATIONS -constexpr bool bicgstab_no_shared_vecs = false; -#else +// TODO: update when splitting kernels constexpr bool bicgstab_no_shared_vecs = true; -#endif namespace gko { diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp index 028223886fe..031b20b2a61 100644 --- a/core/solver/batch_cg_kernels.hpp +++ b/core/solver/batch_cg_kernels.hpp @@ -16,11 +16,8 @@ #include "core/base/kernel_declaration.hpp" -#ifdef GINKGO_BACTCHED_FULL_OPTIMIZATIONS -constexpr bool cg_no_shared_vecs = false; -#else +// TODO: update when splitting compilation constexpr bool cg_no_shared_vecs = true; -#endif namespace gko { diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index bc12fc7efde..54f489304a7 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -167,76 +167,69 @@ public: value_type* const workspace_data = workspace.get_data(); - // Only instantiate when full optimizations has been enabled. Otherwise, - // just use the default one with no shared memory. 
-#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS + // TODO: split compilation // Template parameters launch_apply_kernel - if (sconf.prec_shared) { - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - } else { - switch (sconf.n_shared) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 6: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 7: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 8: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 9: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } -#else + // if (sconf.prec_shared) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // block_size, shared_size); + // } else { + // switch (sconf.n_shared) { + // case 0: launch_apply_kernel( sconf, logger, prec, mat, b.values, x.values, workspace_data, block_size, 
shared_size); -#endif + // break; + // case 1: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 2: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 3: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 4: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 5: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 6: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 7: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 8: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 9: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // default: + // GKO_NOT_IMPLEMENTED; + // } + // } } private: diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index f09b6c70487..b681bd13ce3 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -165,56 +165,51 @@ public: value_type* const workspace_data = workspace.get_data(); + // TODO: split compilation // Only instantiate when full optimizations has been enabled. Otherwise, // just use the default one with no shared memory. 
-#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel - if (sconf.prec_shared) { - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - } else { - switch (sconf.n_shared) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } -#else + // if (sconf.prec_shared) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // block_size, shared_size); + // } else { + // switch (sconf.n_shared) { + // case 0: launch_apply_kernel( sconf, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); -#endif + // break; + // case 1: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 2: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 3: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 4: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + 
// workspace_data, block_size, shared_size); + // break; + // case 5: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // default: + // GKO_NOT_IMPLEMENTED; + // } + // } } private: diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp index 3b6d5d1c5df..bb84283b49f 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp +++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp @@ -159,85 +159,80 @@ class kernel_caller { ValueType* const workspace_data = workspace.get_data(); int n_shared_total = sconf.n_shared + int(sconf.prec_shared); + // TODO: split compilation // Only instantiate when full optimizations has been enabled. Otherwise, // just use the default one with no shared memory. -#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // template // launch_apply_kernel - if (num_rows <= 32 && n_shared_total == 10) { - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - group_size, shared_size); - } else if (num_rows <= 256 && n_shared_total == 10) { - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - group_size, shared_size); - } else { - switch (n_shared_total) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, 
mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 6: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 7: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 8: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 9: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 10: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } -#else + // if (num_rows <= 32 && n_shared_total == 10) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // group_size, shared_size); + // } else if (num_rows <= 256 && n_shared_total == 10) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // group_size, shared_size); + // } else { + // switch (n_shared_total) { + // case 0: launch_apply_kernel(sconf, logger, prec, mat, b.values, x.values, workspace_data, group_size, shared_size); -#endif + // break; + // case 1: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 2: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 3: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 4: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 5: + // launch_apply_kernel( + // sconf, logger, 
prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 6: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 7: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 8: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 9: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 10: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // default: + // GKO_NOT_IMPLEMENTED; + // } + // } } private: diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp index 36fbe0dc269..61591f9efb6 100644 --- a/dpcpp/solver/batch_cg_kernels.dp.cpp +++ b/dpcpp/solver/batch_cg_kernels.dp.cpp @@ -160,59 +160,53 @@ class kernel_caller { // Only instantiate when full optimizations has been enabled. Otherwise, // just use the default one with no shared memory. 
-#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // template // launch_apply_kernel - if (num_rows <= 32 && n_shared_total == 6) { - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - group_size, shared_size); - } else { - switch (n_shared_total) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 6: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } -#else + // if (num_rows <= 32 && n_shared_total == 6) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // group_size, shared_size); + // } else { + // switch (n_shared_total) { + // case 0: launch_apply_kernel(sconf, logger, prec, mat, b.values, x.values, workspace_data, group_size, shared_size); -#endif + // break; + // case 1: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 2: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 3: + // launch_apply_kernel( + // sconf, logger, prec, mat, 
b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 4: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 5: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 6: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // default: + // GKO_NOT_IMPLEMENTED; + // } + // } } private: diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 54b63983388..ca49fa5eb9c 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -151,74 +151,68 @@ class kernel_caller { // Only instantiate when full optimizations has been enabled. Otherwise, // just use the default one with no shared memory. -#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - } else { - switch (sconf.n_shared) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, 
shared_size); - break; - case 6: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 7: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 8: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 9: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } -#else + // if (sconf.prec_shared) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // block_size, shared_size); + // } else { + // switch (sconf.n_shared) { + // case 0: launch_apply_kernel( sconf, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); -#endif + // break; + // case 1: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 2: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 3: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 4: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 5: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 6: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 7: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // 
case 8: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 9: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // default: + // GKO_NOT_IMPLEMENTED; + // } + // } } private: diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 290fd72b9f7..3a1642edfea 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -151,54 +151,48 @@ class kernel_caller { // Only instantiate when full optimizations has been enabled. Otherwise, // just use the default one with no shared memory. -#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - } else { - switch (sconf.n_shared) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } -#else + // if (sconf.prec_shared) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // block_size, shared_size); + // } else { + // switch 
(sconf.n_shared) { + // case 0: launch_apply_kernel( sconf, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); -#endif + // break; + // case 1: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 2: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 3: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 4: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 5: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // default: + // GKO_NOT_IMPLEMENTED; + // } + // } } private: diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 4eb3106633f..329918399d6 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -31,10 +31,6 @@ #cmakedefine GINKGO_JACOBI_FULL_OPTIMIZATIONS -/* Should we use all optimizations for batched solvers? */ -#cmakedefine GINKGO_BATCHED_FULL_OPTIMIZATIONS - - /* Should we compile Ginkgo specifically to tune values? 
*/ #cmakedefine GINKGO_BENCHMARK_ENABLE_TUNING From 26b362b39f2901a4951b5a42467640052cbc304d Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 5 Aug 2024 13:02:24 +0200 Subject: [PATCH 133/448] [review] review updates --- core/solver/batch_bicgstab_kernels.hpp | 1 - core/solver/batch_cg_kernels.hpp | 1 - 2 files changed, 2 deletions(-) diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp index 5bab0e43b26..615ed472597 100644 --- a/core/solver/batch_bicgstab_kernels.hpp +++ b/core/solver/batch_bicgstab_kernels.hpp @@ -6,7 +6,6 @@ #define GKO_CORE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ -#include #include #include #include diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp index 031b20b2a61..b21a2c07d3e 100644 --- a/core/solver/batch_cg_kernels.hpp +++ b/core/solver/batch_cg_kernels.hpp @@ -6,7 +6,6 @@ #define GKO_CORE_SOLVER_BATCH_CG_KERNELS_HPP_ -#include #include #include #include From e1eedfef4b5b7d8b43cfe9363ee10478080fa11f Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 13 Aug 2024 13:08:19 +0200 Subject: [PATCH 134/448] use smaller block size on cuda --- cuda/solver/batch_bicgstab_kernels.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 54f489304a7..3c7fe50709c 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -144,10 +144,11 @@ public: const int shmem_per_blk = get_max_dynamic_shared_memory(exec_); - const int block_size = - get_num_threads_per_block( - exec_, mat.num_rows); + // TODO + const int block_size = 256; + // get_num_threads_per_block( + // exec_, mat.num_rows); GKO_ASSERT(block_size >= 2 * config::warp_size); const size_t prec_size = PrecType::dynamic_work_size( From c38735c51d7d70ab0228aa13f588eb1a7048230d Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 16 Aug 2024 12:43:44 +0200 Subject: [PATCH 135/448] 
[cmake] add rocthrust through cmake --- cmake/GinkgoConfig.cmake.in | 1 + cmake/hip.cmake | 1 + hip/CMakeLists.txt | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 23b1d25adc1..1f12251f93d 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -175,6 +175,7 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP) find_dependency(hiprand) find_dependency(hipsparse) find_dependency(rocrand) + find_dependency(rocthrust) set_and_check(ROCTRACER_PATH "@ROCTRACER_PATH@") find_dependency(ROCTX) endif() diff --git a/cmake/hip.cmake b/cmake/hip.cmake index c94117242eb..bd834c3ebde 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -123,6 +123,7 @@ find_package(hiprand REQUIRED) find_package(hipsparse REQUIRED) # At the moment, for hiprand to work also rocrand is required. find_package(rocrand REQUIRED) +find_package(rocthrust REQUIRED) find_package(ROCTX) if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND GINKGO_HIP_VERSION VERSION_GREATER_EQUAL 5) diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 30e675509d5..46b2d7bd19b 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -65,7 +65,7 @@ target_include_directories(ginkgo_hip target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip) target_link_libraries(ginkgo_hip PUBLIC ginkgo_device) -target_link_libraries(ginkgo_hip PRIVATE hip::host roc::hipblas roc::hipsparse hip::hiprand roc::rocrand) +target_link_libraries(ginkgo_hip PRIVATE hip::host roc::hipblas roc::hipsparse hip::hiprand roc::rocrand roc::rocthrust) if (hipfft_FOUND) target_link_libraries(ginkgo_hip PRIVATE hip::hipfft) endif() From 65980d34c4df73e3c166a1596fae12a92a3690a1 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Fri, 16 Aug 2024 00:31:07 +0200 Subject: [PATCH 136/448] use tbb from onemkl, and add the path after installing --- .github/workflows/intel.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index 7a1e97a80e8..0aa435dfee3 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -33,20 +33,20 @@ jobs: - name: configure run: | source /etc/profile - module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl intel-oneapi-tbb cmake + module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl cmake mkdir build cd build cmake .. -DCMAKE_INSTALL_PREFIX=install_ginkgo -DCMAKE_CXX_FLAGS="-Wpedantic -ffp-model=precise" -DCMAKE_CXX_COMPILER=${{ matrix.config.compiler }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_MPI=OFF -DGINKGO_DPCPP_SINGLE_MODE=ON make -j8 ONEAPI_DEVICE_SELECTOR=level_zero:gpu ctest -j10 --output-on-failure - + - name: install run: | source /etc/profile - module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl intel-oneapi-tbb cmake + module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl cmake cd build SYCL_DEVICE_FILTER=level_zero:gpu make install export GINKGO_PATH="$(pwd)/install_ginkgo/lib" - export LIBRARY_PATH=${GINKGO_PATH}:$LIBRARY_PATH - export LD_LIBRARY_PATH=${GINKGO_PATH}:$LD_LIBRARY_PATH + export LIBRARY_PATH=${ICL_INTEL_TBB_ROOT}/lib64:${GINKGO_PATH}:$LIBRARY_PATH + export LD_LIBRARY_PATH=${ICL_INTEL_TBB_ROOT}/lib64:${GINKGO_PATH}:$LD_LIBRARY_PATH SYCL_DEVICE_FILTER=level_zero:gpu make test_install From 06af951d63d6ae66fb7c801241d8a83f52be5dea Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 6 Aug 2024 11:24:38 +0200 Subject: [PATCH 137/448] e uses next level precision, but the coarsest solver uses the last level precision.
Thus, we can not cast e to current level precision unless it is the last level --- core/solver/multigrid.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 6a8b5ee151b..35ad7b5d1fe 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -486,7 +486,7 @@ void MultigridState::run_cycle(multigrid::cycle cycle, size_type level, auto r = r_list.at(level); auto g = g_list.at(level); - auto e = as(e_list.at(level)); + auto e = e_list.at(level); // get mg_level auto mg_level = multigrid->get_mg_level_list().at(level); // get the pre_smoother @@ -537,7 +537,7 @@ void MultigridState::run_cycle(multigrid::cycle cycle, size_type level, // next level if (level + 1 == total_level) { // the coarsest solver use the last level valuetype - e->fill(zero()); + as(e)->fill(zero()); } auto next_level_matrix = (level + 1 < total_level) From 2d3d622f83128303e33fd6675ebb3e46e6a082c7 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 23 Jul 2024 13:34:59 +0200 Subject: [PATCH 138/448] unify cuda/hip batch_mvec --- common/cuda_hip/CMakeLists.txt | 1 + ...hpp.inc => batch_multi_vector_kernels.cpp} | 67 +++- .../base/batch_multi_vector_kernels.hpp | 326 ++++++++++++++++++ .../base/batch_multi_vector_kernels.hpp.inc | 43 +-- cuda/CMakeLists.txt | 1 - cuda/base/batch_multi_vector_kernels.cu | 56 --- hip/CMakeLists.txt | 1 - hip/base/batch_multi_vector_kernels.hip.cpp | 56 --- 8 files changed, 390 insertions(+), 161 deletions(-) rename common/cuda_hip/base/{batch_multi_vector_kernel_launcher.hpp.inc => batch_multi_vector_kernels.cpp} (67%) create mode 100644 common/cuda_hip/base/batch_multi_vector_kernels.hpp delete mode 100644 cuda/base/batch_multi_vector_kernels.cu delete mode 100644 hip/base/batch_multi_vector_kernels.hip.cpp diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 463abfd9284..15d3a82419e 100644 --- a/common/cuda_hip/CMakeLists.txt +++ 
b/common/cuda_hip/CMakeLists.txt @@ -1,5 +1,6 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) set(CUDA_HIP_SOURCES + base/batch_multi_vector_kernels.cpp base/device_matrix_data_kernels.cpp base/index_set_kernels.cpp components/prefix_sum_kernels.cpp diff --git a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.cpp similarity index 67% rename from common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc rename to common/cuda_hip/base/batch_multi_vector_kernels.cpp index 19b5b74a547..17f65487464 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp @@ -2,6 +2,32 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" + +#include +#include + +#include +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "core/base/batch_multi_vector_kernels.hpp" +#include "core/base/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_multi_vector { + + +constexpr auto default_block_size = 256; + + template void scale(std::shared_ptr exec, const batch::MultiVector* const alpha, @@ -11,16 +37,19 @@ void scale(std::shared_ptr exec, const auto alpha_ub = get_batch_struct(alpha); const auto x_ub = get_batch_struct(x); if (alpha->get_common_size()[1] == 1) { - scale_kernel<<get_stream()>>>( + batch_single_kernels::scale_kernel<<get_stream()>>>( alpha_ub, x_ub, [] __device__(int row, int col, int stride) { return 0; }); } else if (alpha->get_common_size() == x->get_common_size()) { - scale_kernel<<get_stream()>>>( + batch_single_kernels::scale_kernel<<get_stream()>>>( alpha_ub, x_ub, [] __device__(int row, int col, int stride) { return row * stride + col; }); } else { - 
scale_kernel<<get_stream()>>>( + batch_single_kernels::scale_kernel<<get_stream()>>>( alpha_ub, x_ub, [] __device__(int row, int col, int stride) { return col; }); } @@ -42,12 +71,12 @@ void add_scaled(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); if (alpha->get_common_size()[1] == 1) { - add_scaled_kernel<<get_stream()>>>( + batch_single_kernels::add_scaled_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; }); } else { - add_scaled_kernel<<get_stream()>>>( + batch_single_kernels::add_scaled_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; }); } } @@ -67,8 +96,8 @@ void compute_dot(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); const auto res_ub = get_batch_struct(result); - compute_gen_dot_product_kernel<<get_stream()>>>( + batch_single_kernels::compute_gen_dot_product_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( x_ub, y_ub, res_ub, [] __device__(auto val) { return val; }); } @@ -87,8 +116,8 @@ void compute_conj_dot(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); const auto res_ub = get_batch_struct(result); - compute_gen_dot_product_kernel<<get_stream()>>>( + batch_single_kernels::compute_gen_dot_product_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); }); } @@ -105,8 +134,9 @@ void compute_norm2(std::shared_ptr exec, const auto num_rhs = x->get_common_size()[1]; const auto x_ub = get_batch_struct(x); const auto res_ub = get_batch_struct(result); - compute_norm2_kernel<<get_stream()>>>(x_ub, res_ub); + batch_single_kernels::compute_norm2_kernel<<get_stream()>>>( + x_ub, res_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -121,8 +151,15 @@ void 
copy(std::shared_ptr exec, const auto num_blocks = x->get_num_batch_items(); const auto result_ub = get_batch_struct(result); const auto x_ub = get_batch_struct(x); - copy_kernel<<get_stream()>>>( - x_ub, result_ub); + batch_single_kernels:: + copy_kernel<<get_stream()>>>( + x_ub, result_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); + + +} // namespace batch_multi_vector +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp new file mode 100644 index 00000000000..36aa69d7d99 --- /dev/null +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp @@ -0,0 +1,326 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include + +#include +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/base/batch_struct.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/batch_struct.hip.hpp" +#else +#error "batch struct def missing" +#endif + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_multi_vector { +namespace batch_single_kernels { + + +constexpr auto default_block_size = 256; + + +template +__device__ __forceinline__ void scale( + const gko::batch::multi_vector::batch_item& alpha, + const 
gko::batch::multi_vector::batch_item& x, Mapping map) +{ + const int max_li = x.num_rows * x.num_rhs; + for (int li = threadIdx.x; li < max_li; li += blockDim.x) { + const int row = li / x.num_rhs; + const int col = li % x.num_rhs; + + x.values[row * x.stride + col] = + alpha.values[map(row, col, alpha.stride)] * + x.values[row * x.stride + col]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void scale_kernel( + const gko::batch::multi_vector::uniform_batch alpha, + const gko::batch::multi_vector::uniform_batch x, Mapping map) +{ + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; + batch_id += gridDim.x) { + const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + scale(alpha_b, x_b, map); + } +} + + +template +__device__ __forceinline__ void add_scaled( + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, Mapping map) +{ + const int max_li = x.num_rows * x.num_rhs; + for (int li = threadIdx.x; li < max_li; li += blockDim.x) { + const int row = li / x.num_rhs; + const int col = li % x.num_rhs; + + y.values[row * y.stride + col] += + alpha.values[map(col)] * x.values[row * x.stride + col]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void add_scaled_kernel( + const gko::batch::multi_vector::uniform_batch alpha, + const gko::batch::multi_vector::uniform_batch x, + const gko::batch::multi_vector::uniform_batch y, Mapping map) +{ + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; + batch_id += gridDim.x) { + const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto y_b = gko::batch::extract_batch_item(y, batch_id); + add_scaled(alpha_b, x_b, y_b, map); + } +} + + +template +__device__ __forceinline__ void 
single_rhs_compute_conj_dot(Group subgroup, + const int num_rows, + const ValueType* x, + const ValueType* y, + ValueType& result) + +{ + ValueType val = zero(); + for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) { + val += conj(x[r]) * y[r]; + } + + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus{}); + + if (subgroup.thread_rank() == 0) { + result = val; + } +} + + +template +__device__ __forceinline__ void gen_one_dot( + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, + const int rhs_index, + const gko::batch::multi_vector::batch_item& result, + Group subgroup, Mapping conj_map) +{ + ValueType val = zero(); + + for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) { + val += conj_map(x.values[r * x.stride + rhs_index]) * + y.values[r * y.stride + rhs_index]; + } + + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus{}); + + if (subgroup.thread_rank() == 0) { + result.values[rhs_index] = val; + } +} + + +template +__device__ __forceinline__ void compute_gen_dot_product( + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, + const gko::batch::multi_vector::batch_item& result, + Mapping conj_map) +{ + constexpr auto tile_size = config::warp_size; + auto thread_block = group::this_thread_block(); + auto subgroup = group::tiled_partition(thread_block); + const auto subgroup_id = static_cast(threadIdx.x / tile_size); + const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); + + for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; + rhs_index += num_subgroups_per_block) { + gen_one_dot(x, y, rhs_index, result, subgroup, conj_map); + } +} + + +template +__global__ +__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel( + const gko::batch::multi_vector::uniform_batch x, + const gko::batch::multi_vector::uniform_batch y, + const 
gko::batch::multi_vector::uniform_batch result, + Mapping map) +{ + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; + batch_id += gridDim.x) { + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto y_b = gko::batch::extract_batch_item(y, batch_id); + const auto r_b = gko::batch::extract_batch_item(result, batch_id); + compute_gen_dot_product(x_b, y_b, r_b, map); + } +} + + +template +__device__ __forceinline__ void single_rhs_compute_norm2( + Group subgroup, const int num_rows, const ValueType* x, + remove_complex& result) +{ + using real_type = typename gko::remove_complex; + real_type val = zero(); + + for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) { + val += squared_norm(x[r]); + } + + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus>{}); + + if (subgroup.thread_rank() == 0) { + result = sqrt(val); + } +} + + +template +__device__ __forceinline__ void one_norm2( + const gko::batch::multi_vector::batch_item& x, + const int rhs_index, + const gko::batch::multi_vector::batch_item>& + result, + Group subgroup) +{ + using real_type = typename gko::remove_complex; + real_type val = zero(); + + for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) { + val += squared_norm(x.values[r * x.stride + rhs_index]); + } + + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus>{}); + + if (subgroup.thread_rank() == 0) { + result.values[rhs_index] = sqrt(val); + } +} + + +/** + * Computes the 2-norms of some column vectors in global or shared memory. + * + * @param x A row-major multivector with nrhs columns. + * @param result Holds norm value for each vector in x. 
+ */ +template +__device__ __forceinline__ void compute_norm2( + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item>& + result) +{ + constexpr auto tile_size = config::warp_size; + auto thread_block = group::this_thread_block(); + auto subgroup = group::tiled_partition(thread_block); + const auto subgroup_id = static_cast(threadIdx.x / tile_size); + const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); + + for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; + rhs_index += num_subgroups_per_block) { + one_norm2(x, rhs_index, result, subgroup); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel( + const gko::batch::multi_vector::uniform_batch x, + const gko::batch::multi_vector::uniform_batch> + result) +{ + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; + batch_id += gridDim.x) { + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto r_b = gko::batch::extract_batch_item(result, batch_id); + compute_norm2(x_b, r_b); + } +} + + +template +__device__ __forceinline__ void single_rhs_copy(const int num_rows, + const ValueType* in, + ValueType* out) +{ + for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) { + out[iz] = in[iz]; + } +} + + +/** + * Copies the values of one multi-vector into another. + * + * Note that the output multi-vector should already have memory allocated + * and stride set. 
+ */ +template +__device__ __forceinline__ void copy( + const gko::batch::multi_vector::batch_item& in, + const gko::batch::multi_vector::batch_item& out) +{ + for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs; + iz += blockDim.x) { + const int i = iz / in.num_rhs; + const int j = iz % in.num_rhs; + out.values[i * out.stride + j] = in.values[i * in.stride + j]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void copy_kernel( + const gko::batch::multi_vector::uniform_batch src, + const gko::batch::multi_vector::uniform_batch dst) +{ + for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items; + batch_id += gridDim.x) { + const auto dst_b = gko::batch::extract_batch_item(dst, batch_id); + const auto src_b = gko::batch::extract_batch_item(src, batch_id); + copy(src_b, dst_b); + } +} + + +} // namespace batch_single_kernels +} // namespace batch_multi_vector +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc index 9b6301674be..7af3c84303f 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -20,8 +20,7 @@ __device__ __forceinline__ void scale( template -__global__ -__launch_bounds__(default_block_size, sm_oversubscription) void scale_kernel( +__global__ __launch_bounds__(default_block_size) void scale_kernel( const gko::batch::multi_vector::uniform_batch alpha, const gko::batch::multi_vector::uniform_batch x, Mapping map) { @@ -52,20 +51,10 @@ __device__ __forceinline__ void add_scaled( template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void add_scaled_kernel(const gko::batch::multi_vector:: - uniform_batch< - const ValueType> - alpha, - const gko::batch::multi_vector:: - uniform_batch< - const ValueType> - x, - const gko::batch::multi_vector:: - uniform_batch 
- y, - Mapping map) +__global__ __launch_bounds__(default_block_size) void add_scaled_kernel( + const gko::batch::multi_vector::uniform_batch alpha, + const gko::batch::multi_vector::uniform_batch x, + const gko::batch::multi_vector::uniform_batch y, Mapping map) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; batch_id += gridDim.x) { @@ -145,7 +134,7 @@ __device__ __forceinline__ void compute_gen_dot_product( template __global__ -__launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel( +__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel( const gko::batch::multi_vector::uniform_batch x, const gko::batch::multi_vector::uniform_batch y, const gko::batch::multi_vector::uniform_batch result, @@ -232,19 +221,10 @@ __device__ __forceinline__ void compute_norm2( template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void compute_norm2_kernel(const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - x, - const gko::batch:: - multi_vector:: - uniform_batch< - remove_complex< - ValueType>> - result) +__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel( + const gko::batch::multi_vector::uniform_batch x, + const gko::batch::multi_vector::uniform_batch> + result) { for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; batch_id += gridDim.x) { @@ -287,8 +267,7 @@ __device__ __forceinline__ void copy( template -__global__ -__launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel( +__global__ __launch_bounds__(default_block_size) void copy_kernel( const gko::batch::multi_vector::uniform_batch src, const gko::batch::multi_vector::uniform_batch dst) { diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index d4a94eda802..3631a65f48d 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -7,7 +7,6 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kerne list(APPEND 
GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda PRIVATE - base/batch_multi_vector_kernels.cu base/device.cpp base/exception.cpp base/executor.cpp diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu deleted file mode 100644 index 3dad5ba94f1..00000000000 --- a/cuda/base/batch_multi_vector_kernels.cu +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/base/batch_multi_vector_kernels.hpp" - -#include -#include - -#include -#include - -#include "common/cuda_hip/base/blas_bindings.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/pointer_mode_guard.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" -#include "core/base/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The MultiVector matrix format namespace. 
- * - * @ingroup batch_multi_vector - */ -namespace batch_multi_vector { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" - - -#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_multi_vector -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 46b2d7bd19b..84bba295120 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -5,7 +5,6 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kerne # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES - base/batch_multi_vector_kernels.hip.cpp base/device.hip.cpp base/exception.hip.cpp base/executor.hip.cpp diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp deleted file mode 100644 index 701f4655a9a..00000000000 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/base/batch_multi_vector_kernels.hpp" - -#include -#include - -#include -#include - -#include "common/cuda_hip/base/blas_bindings.hpp" -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/pointer_mode_guard.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include 
"core/base/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The MultiVector matrix format namespace. - * - * @ingroup batch_multi_vector - */ -namespace batch_multi_vector { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" - - -#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_multi_vector -} // namespace hip -} // namespace kernels -} // namespace gko From dd66c6702fb6e8010042f9795c14cf43fe3f5244 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 19 Aug 2024 11:03:12 +0200 Subject: [PATCH 139/448] [cuda,hip] update namespaces and includes --- .../base/batch_multi_vector_kernels.cpp | 37 +-- .../base/batch_multi_vector_kernels.hpp | 2 - .../base/batch_multi_vector_kernels.hpp.inc | 280 ------------------ .../solver/batch_bicgstab_kernels.hpp.inc | 41 ++- .../cuda_hip/solver/batch_cg_kernels.hpp.inc | 23 +- cuda/solver/batch_bicgstab_kernels.cu | 2 +- cuda/solver/batch_cg_kernels.cu | 2 +- hip/solver/batch_bicgstab_kernels.hip.cpp | 2 +- hip/solver/batch_cg_kernels.hip.cpp | 2 +- 9 files changed, 63 insertions(+), 328 deletions(-) delete mode 100644 common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.cpp b/common/cuda_hip/base/batch_multi_vector_kernels.cpp index 17f65487464..76565a83f80 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.cpp +++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp @@ -37,19 +37,19 @@ void scale(std::shared_ptr exec, const auto alpha_ub = get_batch_struct(alpha); const auto x_ub = get_batch_struct(x); if (alpha->get_common_size()[1] == 1) { - batch_single_kernels::scale_kernel<<get_stream()>>>( + 
GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( alpha_ub, x_ub, [] __device__(int row, int col, int stride) { return 0; }); } else if (alpha->get_common_size() == x->get_common_size()) { - batch_single_kernels::scale_kernel<<get_stream()>>>( + GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( alpha_ub, x_ub, [] __device__(int row, int col, int stride) { return row * stride + col; }); } else { - batch_single_kernels::scale_kernel<<get_stream()>>>( + GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( alpha_ub, x_ub, [] __device__(int row, int col, int stride) { return col; }); } @@ -71,11 +71,11 @@ void add_scaled(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); if (alpha->get_common_size()[1] == 1) { - batch_single_kernels::add_scaled_kernel<<< + GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<< num_blocks, default_block_size, 0, exec->get_stream()>>>( alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; }); } else { - batch_single_kernels::add_scaled_kernel<<< + GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<< num_blocks, default_block_size, 0, exec->get_stream()>>>( alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; }); } @@ -96,9 +96,10 @@ void compute_dot(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); const auto res_ub = get_batch_struct(result); - batch_single_kernels::compute_gen_dot_product_kernel<<< - num_blocks, default_block_size, 0, exec->get_stream()>>>( - x_ub, y_ub, res_ub, [] __device__(auto val) { return val; }); + GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_gen_dot_product_kernel<<get_stream()>>>( + x_ub, y_ub, res_ub, [] __device__(auto val) { return val; }); } 
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -116,9 +117,10 @@ void compute_conj_dot(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); const auto res_ub = get_batch_struct(result); - batch_single_kernels::compute_gen_dot_product_kernel<<< - num_blocks, default_block_size, 0, exec->get_stream()>>>( - x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); }); + GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_gen_dot_product_kernel<<get_stream()>>>( + x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); }); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -134,9 +136,8 @@ void compute_norm2(std::shared_ptr exec, const auto num_rhs = x->get_common_size()[1]; const auto x_ub = get_batch_struct(x); const auto res_ub = get_batch_struct(result); - batch_single_kernels::compute_norm2_kernel<<get_stream()>>>( - x_ub, res_ub); + GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>(x_ub, res_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -151,7 +152,7 @@ void copy(std::shared_ptr exec, const auto num_blocks = x->get_num_batch_items(); const auto result_ub = get_batch_struct(result); const auto x_ub = get_batch_struct(x); - batch_single_kernels:: + GKO_DEVICE_NAMESPACE::batch_single_kernels:: copy_kernel<<get_stream()>>>( x_ub, result_ub); } diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp index 36aa69d7d99..bb3aac67b55 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp @@ -34,7 +34,6 @@ namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { -namespace batch_multi_vector { namespace batch_single_kernels { @@ -320,7 +319,6 @@ __global__ __launch_bounds__(default_block_size) void copy_kernel( } // namespace batch_single_kernels -} // namespace batch_multi_vector } // namespace 
GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc deleted file mode 100644 index 7af3c84303f..00000000000 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ /dev/null @@ -1,280 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -template -__device__ __forceinline__ void scale( - const gko::batch::multi_vector::batch_item& alpha, - const gko::batch::multi_vector::batch_item& x, Mapping map) -{ - const int max_li = x.num_rows * x.num_rhs; - for (int li = threadIdx.x; li < max_li; li += blockDim.x) { - const int row = li / x.num_rhs; - const int col = li % x.num_rhs; - - x.values[row * x.stride + col] = - alpha.values[map(row, col, alpha.stride)] * - x.values[row * x.stride + col]; - } -} - - -template -__global__ __launch_bounds__(default_block_size) void scale_kernel( - const gko::batch::multi_vector::uniform_batch alpha, - const gko::batch::multi_vector::uniform_batch x, Mapping map) -{ - for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; - batch_id += gridDim.x) { - const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id); - const auto x_b = gko::batch::extract_batch_item(x, batch_id); - scale(alpha_b, x_b, map); - } -} - - -template -__device__ __forceinline__ void add_scaled( - const gko::batch::multi_vector::batch_item& alpha, - const gko::batch::multi_vector::batch_item& x, - const gko::batch::multi_vector::batch_item& y, Mapping map) -{ - const int max_li = x.num_rows * x.num_rhs; - for (int li = threadIdx.x; li < max_li; li += blockDim.x) { - const int row = li / x.num_rhs; - const int col = li % x.num_rhs; - - y.values[row * y.stride + col] += - alpha.values[map(col)] * x.values[row * x.stride + col]; - } -} - - -template -__global__ __launch_bounds__(default_block_size) void add_scaled_kernel( - const 
gko::batch::multi_vector::uniform_batch alpha, - const gko::batch::multi_vector::uniform_batch x, - const gko::batch::multi_vector::uniform_batch y, Mapping map) -{ - for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; - batch_id += gridDim.x) { - const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id); - const auto x_b = gko::batch::extract_batch_item(x, batch_id); - const auto y_b = gko::batch::extract_batch_item(y, batch_id); - add_scaled(alpha_b, x_b, y_b, map); - } -} - - -template -__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup, - const int num_rows, - const ValueType* x, - const ValueType* y, - ValueType& result) - -{ - ValueType val = zero(); - for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) { - val += conj(x[r]) * y[r]; - } - - // subgroup level reduction - val = reduce(subgroup, val, thrust::plus{}); - - if (subgroup.thread_rank() == 0) { - result = val; - } -} - - -template -__device__ __forceinline__ void gen_one_dot( - const gko::batch::multi_vector::batch_item& x, - const gko::batch::multi_vector::batch_item& y, - const int rhs_index, - const gko::batch::multi_vector::batch_item& result, - Group subgroup, Mapping conj_map) -{ - ValueType val = zero(); - - for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) { - val += conj_map(x.values[r * x.stride + rhs_index]) * - y.values[r * y.stride + rhs_index]; - } - - // subgroup level reduction - val = reduce(subgroup, val, thrust::plus{}); - - if (subgroup.thread_rank() == 0) { - result.values[rhs_index] = val; - } -} - - -template -__device__ __forceinline__ void compute_gen_dot_product( - const gko::batch::multi_vector::batch_item& x, - const gko::batch::multi_vector::batch_item& y, - const gko::batch::multi_vector::batch_item& result, - Mapping conj_map) -{ - constexpr auto tile_size = config::warp_size; - auto thread_block = group::this_thread_block(); - auto subgroup = 
group::tiled_partition(thread_block); - const auto subgroup_id = static_cast(threadIdx.x / tile_size); - const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); - - for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; - rhs_index += num_subgroups_per_block) { - gen_one_dot(x, y, rhs_index, result, subgroup, conj_map); - } -} - - -template -__global__ -__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel( - const gko::batch::multi_vector::uniform_batch x, - const gko::batch::multi_vector::uniform_batch y, - const gko::batch::multi_vector::uniform_batch result, - Mapping map) -{ - for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; - batch_id += gridDim.x) { - const auto x_b = gko::batch::extract_batch_item(x, batch_id); - const auto y_b = gko::batch::extract_batch_item(y, batch_id); - const auto r_b = gko::batch::extract_batch_item(result, batch_id); - compute_gen_dot_product(x_b, y_b, r_b, map); - } -} - - -template -__device__ __forceinline__ void single_rhs_compute_norm2( - Group subgroup, const int num_rows, const ValueType* x, - remove_complex& result) -{ - using real_type = typename gko::remove_complex; - real_type val = zero(); - - for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) { - val += squared_norm(x[r]); - } - - // subgroup level reduction - val = reduce(subgroup, val, thrust::plus>{}); - - if (subgroup.thread_rank() == 0) { - result = sqrt(val); - } -} - - -template -__device__ __forceinline__ void one_norm2( - const gko::batch::multi_vector::batch_item& x, - const int rhs_index, - const gko::batch::multi_vector::batch_item>& - result, - Group subgroup) -{ - using real_type = typename gko::remove_complex; - real_type val = zero(); - - for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) { - val += squared_norm(x.values[r * x.stride + rhs_index]); - } - - // subgroup level reduction - val = reduce(subgroup, val, thrust::plus>{}); - - if 
(subgroup.thread_rank() == 0) { - result.values[rhs_index] = sqrt(val); - } -} - - -/** - * Computes the 2-norms of some column vectors in global or shared memory. - * - * @param x A row-major multivector with nrhs columns. - * @param result Holds norm value for each vector in x. - */ -template -__device__ __forceinline__ void compute_norm2( - const gko::batch::multi_vector::batch_item& x, - const gko::batch::multi_vector::batch_item>& - result) -{ - constexpr auto tile_size = config::warp_size; - auto thread_block = group::this_thread_block(); - auto subgroup = group::tiled_partition(thread_block); - const auto subgroup_id = static_cast(threadIdx.x / tile_size); - const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); - - for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; - rhs_index += num_subgroups_per_block) { - one_norm2(x, rhs_index, result, subgroup); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel( - const gko::batch::multi_vector::uniform_batch x, - const gko::batch::multi_vector::uniform_batch> - result) -{ - for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; - batch_id += gridDim.x) { - const auto x_b = gko::batch::extract_batch_item(x, batch_id); - const auto r_b = gko::batch::extract_batch_item(result, batch_id); - compute_norm2(x_b, r_b); - } -} - - -template -__device__ __forceinline__ void single_rhs_copy(const int num_rows, - const ValueType* in, - ValueType* out) -{ - for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) { - out[iz] = in[iz]; - } -} - - -/** - * Copies the values of one multi-vector into another. - * - * Note that the output multi-vector should already have memory allocated - * and stride set. 
- */ -template -__device__ __forceinline__ void copy( - const gko::batch::multi_vector::batch_item& in, - const gko::batch::multi_vector::batch_item& out) -{ - for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs; - iz += blockDim.x) { - const int i = iz / in.num_rhs; - const int j = iz % in.num_rhs; - out.values[i * out.stride + j] = in.values[i * in.stride + j]; - } -} - - -template -__global__ __launch_bounds__(default_block_size) void copy_kernel( - const gko::batch::multi_vector::uniform_batch src, - const gko::batch::multi_vector::uniform_batch dst) -{ - for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items; - batch_id += gridDim.x) { - const auto dst_b = gko::batch::extract_batch_item(dst, batch_id); - const auto src_b = gko::batch::extract_batch_item(src, batch_id); - copy(src_b, dst_b); - } -} diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc index f71c8c40c3e..c2a53b2e518 100644 --- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc +++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc @@ -32,10 +32,14 @@ __device__ __forceinline__ void initialize( __syncthreads(); if (threadIdx.x / config::warp_size == 0) { - single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry, res_norm); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry, + res_norm); } else if (threadIdx.x / config::warp_size == 1) { // Compute norms of rhs - single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, rhs_norm); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, + rhs_norm); } __syncthreads(); @@ -70,8 +74,9 @@ __device__ __forceinline__ void compute_alpha( const ValueType* const v_shared_entry, ValueType& alpha) { if (threadIdx.x / config::warp_size == 0) { - single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry, - 
v_shared_entry, alpha); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry, + v_shared_entry, alpha); } __syncthreads(); if (threadIdx.x == 0) { @@ -99,11 +104,13 @@ __device__ __forceinline__ void compute_omega( const ValueType* const s_shared_entry, ValueType& temp, ValueType& omega) { if (threadIdx.x / config::warp_size == 0) { - single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry, - s_shared_entry, omega); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry, + s_shared_entry, omega); } else if (threadIdx.x / config::warp_size == 1) { - single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry, - t_shared_entry, temp); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry, + t_shared_entry, temp); } __syncthreads(); @@ -271,8 +278,9 @@ __global__ void apply_kernel( // rho_new = < r_hat , r > = (r_hat)' * (r) if (threadIdx.x / config::warp_size == 0) { - single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh, r_sh, - rho_new_sh[0]); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh, + r_sh, rho_new_sh[0]); } __syncthreads(); @@ -301,8 +309,9 @@ __global__ void apply_kernel( // an estimate of residual norms if (threadIdx.x / config::warp_size == 0) { - single_rhs_compute_norm2(subgroup, num_rows, s_sh, - norms_res_sh[0]); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_norm2(subgroup, num_rows, s_sh, + norms_res_sh[0]); } __syncthreads(); @@ -333,8 +342,9 @@ __global__ void apply_kernel( __syncthreads(); if (threadIdx.x / config::warp_size == 0) { - single_rhs_compute_norm2(subgroup, num_rows, r_sh, - norms_res_sh[0]); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_norm2(subgroup, 
num_rows, r_sh, + norms_res_sh[0]); } //__syncthreads(); @@ -347,7 +357,8 @@ __global__ void apply_kernel( logger.log_iteration(batch_id, iter, norms_res_sh[0]); // copy x back to global memory - single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr); __syncthreads(); } } diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc index ffee501b58c..c95a6b1cf05 100644 --- a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc +++ b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc @@ -32,12 +32,14 @@ __device__ __forceinline__ void initialize( if (threadIdx.x / config::warp_size == 0) { // Compute norms of rhs - single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, - rhs_norms_sh); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, + rhs_norms_sh); } else if (threadIdx.x / config::warp_size == 1) { // rho_old = r' * z - single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry, - z_shared_entry, rho_old_shared_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry, + z_shared_entry, rho_old_shared_entry); } // p = z @@ -69,8 +71,9 @@ __device__ __forceinline__ void update_x_and_r( ValueType* const x_shared_entry, ValueType* const r_shared_entry) { if (threadIdx.x / config::warp_size == 0) { - single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry, - Ap_shared_entry, alpha_shared_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry, + Ap_shared_entry, alpha_shared_entry); } __syncthreads(); @@ -202,8 +205,9 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf, if (threadIdx.x / config::warp_size == 0) { // rho_new = (r)' * (z) - 
single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh, - rho_new_sh[0]); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh, + rho_new_sh[0]); } __syncthreads(); @@ -222,7 +226,8 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf, logger.log_iteration(batch_id, iter, norms_res_sh[0]); // copy x back to global memory - single_rhs_copy(num_rows, x_sh, x_global_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_copy(num_rows, x_sh, x_global_entry); __syncthreads(); } } diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 3c7fe50709c..4d3deb742fe 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -10,6 +10,7 @@ #include #include +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/thrust.hpp" @@ -43,7 +44,6 @@ constexpr int sm_oversubscription = 4; namespace batch_bicgstab { -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index b681bd13ce3..21c3e3d43c4 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -10,6 +10,7 @@ #include #include +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" @@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4; namespace batch_cg { -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" #include 
"common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index ca49fa5eb9c..1c1be8b21f7 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -10,6 +10,7 @@ #include #include +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" @@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4; namespace batch_bicgstab { -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 3a1642edfea..c860286c17c 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -10,6 +10,7 @@ #include #include +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" @@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4; namespace batch_cg { -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" From ae1b24b8617b08fc8ab0a847f93cdd1b1e95a981 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 19 Aug 2024 11:32:50 +0200 Subject: [PATCH 140/448] [ref, omp] move kernels to headers --- omp/base/batch_multi_vector_kernels.cpp | 30 +++++++------- 
omp/solver/batch_bicgstab_kernels.cpp | 2 +- omp/solver/batch_cg_kernels.cpp | 2 +- reference/base/batch_multi_vector_kernels.cpp | 33 +++++++-------- ...hpp.inc => batch_multi_vector_kernels.hpp} | 20 +++++++++ reference/solver/batch_bicgstab_kernels.cpp | 2 +- .../solver/batch_bicgstab_kernels.hpp.inc | 41 ++++++++++++------- reference/solver/batch_cg_kernels.cpp | 2 +- reference/solver/batch_cg_kernels.hpp.inc | 23 +++++++---- 9 files changed, 96 insertions(+), 59 deletions(-) rename reference/base/{batch_multi_vector_kernels.hpp.inc => batch_multi_vector_kernels.hpp} (90%) diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp index 395bf96cc7a..8a947107479 100644 --- a/omp/base/batch_multi_vector_kernels.cpp +++ b/omp/base/batch_multi_vector_kernels.cpp @@ -10,24 +10,18 @@ #include #include +#include "common/unified/base/kernel_launch.hpp" #include "core/components/prefix_sum_kernels.hpp" +#include "reference/base/batch_multi_vector_kernels.hpp" #include "reference/base/batch_struct.hpp" namespace gko { namespace kernels { -namespace omp { -/** - * @brief The batch::MultiVector matrix format namespace. 
- * @ref batch::MultiVector - * @ingroup batch_multi_vector - */ +namespace GKO_DEVICE_NAMESPACE { namespace batch_multi_vector { -#include "reference/base/batch_multi_vector_kernels.hpp.inc" - - template void scale(std::shared_ptr exec, const batch::MultiVector* const alpha, @@ -39,7 +33,7 @@ void scale(std::shared_ptr exec, for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch); const auto x_b = gko::batch::extract_batch_item(x_ub, batch); - scale_kernel(alpha_b, x_b); + GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel(alpha_b, x_b); } } @@ -61,7 +55,8 @@ void add_scaled(std::shared_ptr exec, const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch); const auto x_b = gko::batch::extract_batch_item(x_ub, batch); const auto y_b = gko::batch::extract_batch_item(y_ub, batch); - add_scaled_kernel(alpha_b, x_b, y_b); + GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel(alpha_b, + x_b, y_b); } } @@ -83,7 +78,8 @@ void compute_dot(std::shared_ptr exec, const auto res_b = gko::batch::extract_batch_item(res_ub, batch); const auto x_b = gko::batch::extract_batch_item(x_ub, batch); const auto y_b = gko::batch::extract_batch_item(y_ub, batch); - compute_dot_product_kernel(x_b, y_b, res_b); + GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_dot_product_kernel( + x_b, y_b, res_b); } } @@ -105,7 +101,8 @@ void compute_conj_dot(std::shared_ptr exec, const auto res_b = gko::batch::extract_batch_item(res_ub, batch); const auto x_b = gko::batch::extract_batch_item(x_ub, batch); const auto y_b = gko::batch::extract_batch_item(y_ub, batch); - compute_conj_dot_product_kernel(x_b, y_b, res_b); + GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_conj_dot_product_kernel(x_b, y_b, res_b); } } @@ -124,7 +121,8 @@ void compute_norm2(std::shared_ptr exec, for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { const auto res_b = 
gko::batch::extract_batch_item(res_ub, batch); const auto x_b = gko::batch::extract_batch_item(x_ub, batch); - compute_norm2_kernel(x_b, res_b); + GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel(x_b, + res_b); } } @@ -143,7 +141,7 @@ void copy(std::shared_ptr exec, for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { const auto result_b = gko::batch::extract_batch_item(result_ub, batch); const auto x_b = gko::batch::extract_batch_item(x_ub, batch); - copy_kernel(x_b, result_b); + GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(x_b, result_b); } } @@ -151,6 +149,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); } // namespace batch_multi_vector -} // namespace omp +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp index 81df9c45e51..c245f284106 100644 --- a/omp/solver/batch_bicgstab_kernels.cpp +++ b/omp/solver/batch_bicgstab_kernels.cpp @@ -9,6 +9,7 @@ #include #include "core/solver/batch_dispatch.hpp" +#include "reference/base/batch_multi_vector_kernels.hpp" namespace gko { @@ -28,7 +29,6 @@ namespace { constexpr int max_num_rhs = 1; -#include "reference/base/batch_multi_vector_kernels.hpp.inc" #include "reference/matrix/batch_csr_kernels.hpp.inc" #include "reference/matrix/batch_dense_kernels.hpp.inc" #include "reference/matrix/batch_ell_kernels.hpp.inc" diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp index 51c794ab597..55d6ee29321 100644 --- a/omp/solver/batch_cg_kernels.cpp +++ b/omp/solver/batch_cg_kernels.cpp @@ -9,6 +9,7 @@ #include #include "core/solver/batch_dispatch.hpp" +#include "reference/base/batch_multi_vector_kernels.hpp" namespace gko { @@ -28,7 +29,6 @@ namespace { constexpr int max_num_rhs = 1; -#include "reference/base/batch_multi_vector_kernels.hpp.inc" #include "reference/matrix/batch_csr_kernels.hpp.inc" #include 
"reference/matrix/batch_dense_kernels.hpp.inc" #include "reference/matrix/batch_ell_kernels.hpp.inc" diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp index b0d20a6b826..c05398226f0 100644 --- a/reference/base/batch_multi_vector_kernels.cpp +++ b/reference/base/batch_multi_vector_kernels.cpp @@ -10,24 +10,21 @@ #include #include + +#define GKO_DEVICE_NAMESPACE reference + + #include "core/base/batch_struct.hpp" +#include "reference/base/batch_multi_vector_kernels.hpp" #include "reference/base/batch_struct.hpp" namespace gko { namespace kernels { -namespace reference { -/** - * @brief The batch::MultiVector matrix format namespace. - * @ref batch::MultiVector - * @ingroup batch_multi_vector - */ +namespace GKO_DEVICE_NAMESPACE { namespace batch_multi_vector { -#include "reference/base/batch_multi_vector_kernels.hpp.inc" - - template void scale(std::shared_ptr exec, const batch::MultiVector* alpha, @@ -38,7 +35,7 @@ void scale(std::shared_ptr exec, for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { const auto alpha_b = batch::extract_batch_item(alpha_ub, batch); const auto x_b = batch::extract_batch_item(x_ub, batch); - scale_kernel(alpha_b, x_b); + GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel(alpha_b, x_b); } } @@ -59,7 +56,8 @@ void add_scaled(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch); const auto x_b = batch::extract_batch_item(x_ub, batch); const auto y_b = batch::extract_batch_item(y_ub, batch); - add_scaled_kernel(alpha_b, x_b, y_b); + GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel(alpha_b, + x_b, y_b); } } @@ -80,7 +78,8 @@ void compute_dot(std::shared_ptr exec, const auto res_b = batch::extract_batch_item(res_ub, batch); const auto x_b = batch::extract_batch_item(x_ub, batch); const auto y_b = batch::extract_batch_item(y_ub, batch); - compute_dot_product_kernel(x_b, y_b, res_b); + 
GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_dot_product_kernel( + x_b, y_b, res_b); } } @@ -101,7 +100,8 @@ void compute_conj_dot(std::shared_ptr exec, const auto res_b = batch::extract_batch_item(res_ub, batch); const auto x_b = batch::extract_batch_item(x_ub, batch); const auto y_b = batch::extract_batch_item(y_ub, batch); - compute_conj_dot_product_kernel(x_b, y_b, res_b); + GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_conj_dot_product_kernel(x_b, y_b, res_b); } } @@ -119,7 +119,8 @@ void compute_norm2(std::shared_ptr exec, for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { const auto res_b = batch::extract_batch_item(res_ub, batch); const auto x_b = batch::extract_batch_item(x_ub, batch); - compute_norm2_kernel(x_b, res_b); + GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel(x_b, + res_b); } } @@ -137,7 +138,7 @@ void copy(std::shared_ptr exec, for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { const auto result_b = batch::extract_batch_item(result_ub, batch); const auto x_b = batch::extract_batch_item(x_ub, batch); - copy_kernel(x_b, result_b); + GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(x_b, result_b); } } @@ -145,6 +146,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); } // namespace batch_multi_vector -} // namespace reference +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/reference/base/batch_multi_vector_kernels.hpp.inc b/reference/base/batch_multi_vector_kernels.hpp similarity index 90% rename from reference/base/batch_multi_vector_kernels.hpp.inc rename to reference/base/batch_multi_vector_kernels.hpp index 24e59664b74..88f531f29cc 100644 --- a/reference/base/batch_multi_vector_kernels.hpp.inc +++ b/reference/base/batch_multi_vector_kernels.hpp @@ -2,6 +2,20 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include +#include + +#include 
"reference/base/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template inline void scale_kernel( const gko::batch::multi_vector::batch_item& alpha, @@ -129,3 +143,9 @@ inline void copy_kernel( out.values[i * out.stride + j] = in.values[i * in.stride + j]; } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp index 97de157fb90..e68caffa936 100644 --- a/reference/solver/batch_bicgstab_kernels.cpp +++ b/reference/solver/batch_bicgstab_kernels.cpp @@ -5,6 +5,7 @@ #include "core/solver/batch_bicgstab_kernels.hpp" #include "core/solver/batch_dispatch.hpp" +#include "reference/base/batch_multi_vector_kernels.hpp" namespace gko { @@ -26,7 +27,6 @@ namespace { constexpr int max_num_rhs = 1; -#include "reference/base/batch_multi_vector_kernels.hpp.inc" #include "reference/matrix/batch_csr_kernels.hpp.inc" #include "reference/matrix/batch_dense_kernels.hpp.inc" #include "reference/matrix/batch_ell_kernels.hpp.inc" diff --git a/reference/solver/batch_bicgstab_kernels.hpp.inc b/reference/solver/batch_bicgstab_kernels.hpp.inc index b61db3669ef..1f8537ab66d 100644 --- a/reference/solver/batch_bicgstab_kernels.hpp.inc +++ b/reference/solver/batch_bicgstab_kernels.hpp.inc @@ -25,17 +25,20 @@ inline void initialize( alpha_entry.values[0] = one(); // Compute norms of rhs - compute_norm2_kernel(b_entry, rhs_norms_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_norm2_kernel(b_entry, rhs_norms_entry); // r = b - copy_kernel(b_entry, r_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( + b_entry, r_entry); // r = b - A*x advanced_apply_kernel(static_cast(-1.0), A_entry, gko::batch::to_const(x_entry), static_cast(1.0), r_entry); - 
compute_norm2_kernel(gko::batch::to_const(r_entry), - res_norms_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_norm2_kernel(gko::batch::to_const(r_entry), + res_norms_entry); for (int r = 0; r < p_entry.num_rows; r++) { r_hat_entry.values[r * r_hat_entry.stride] = @@ -75,7 +78,9 @@ inline void compute_alpha( const gko::batch::multi_vector::batch_item& v_entry, const gko::batch::multi_vector::batch_item& alpha_entry) { - compute_dot_product_kernel(r_hat_entry, v_entry, alpha_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_dot_product_kernel(r_hat_entry, v_entry, + alpha_entry); alpha_entry.values[0] = rho_new_entry.values[0] / alpha_entry.values[0]; } @@ -102,8 +107,10 @@ inline void compute_omega( const gko::batch::multi_vector::batch_item& temp_entry, const gko::batch::multi_vector::batch_item& omega_entry) { - compute_dot_product_kernel(t_entry, s_entry, omega_entry); - compute_dot_product_kernel(t_entry, t_entry, temp_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_dot_product_kernel(t_entry, s_entry, omega_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_dot_product_kernel(t_entry, t_entry, temp_entry); omega_entry.values[0] /= temp_entry.values[0]; } @@ -246,9 +253,10 @@ inline void batch_entry_bicgstab_impl( } // rho_new = < r_hat , r > = (r_hat)' * (r) - compute_dot_product_kernel(gko::batch::to_const(r_hat_entry), - gko::batch::to_const(r_entry), - rho_new_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_dot_product_kernel( + gko::batch::to_const(r_hat_entry), + gko::batch::to_const(r_entry), rho_new_entry); // beta = (rho_new / rho_old)*(alpha / omega) // p = r + beta*(p - omega * v) @@ -277,8 +285,9 @@ inline void batch_entry_bicgstab_impl( gko::batch::to_const(v_entry), s_entry); // an estimate of residual norms - compute_norm2_kernel(gko::batch::to_const(s_entry), - res_norms_entry); + 
gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_norm2_kernel(gko::batch::to_const(s_entry), + res_norms_entry); if (stop.check_converged(res_norms_entry.values)) { // update x for the systems @@ -310,11 +319,13 @@ inline void batch_entry_bicgstab_impl( gko::batch::to_const(s_entry), gko::batch::to_const(t_entry), x_entry, r_entry); - compute_norm2_kernel(gko::batch::to_const(r_entry), - res_norms_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_norm2_kernel(gko::batch::to_const(r_entry), + res_norms_entry); // rho_old = rho_new - copy_kernel(gko::batch::to_const(rho_new_entry), rho_old_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( + gko::batch::to_const(rho_new_entry), rho_old_entry); } logger.log_iteration(batch_item_id, iter, res_norms_entry.values[0]); diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp index 290fbc3718b..785a7a868a2 100644 --- a/reference/solver/batch_cg_kernels.cpp +++ b/reference/solver/batch_cg_kernels.cpp @@ -5,6 +5,7 @@ #include "core/solver/batch_cg_kernels.hpp" #include "core/solver/batch_dispatch.hpp" +#include "reference/base/batch_multi_vector_kernels.hpp" namespace gko { @@ -26,7 +27,6 @@ namespace { constexpr int max_num_rhs = 1; -#include "reference/base/batch_multi_vector_kernels.hpp.inc" #include "reference/matrix/batch_csr_kernels.hpp.inc" #include "reference/matrix/batch_dense_kernels.hpp.inc" #include "reference/matrix/batch_ell_kernels.hpp.inc" diff --git a/reference/solver/batch_cg_kernels.hpp.inc b/reference/solver/batch_cg_kernels.hpp.inc index b3df5ba97fd..ca88940cd69 100644 --- a/reference/solver/batch_cg_kernels.hpp.inc +++ b/reference/solver/batch_cg_kernels.hpp.inc @@ -26,10 +26,12 @@ inline void initialize( } // Compute norms of rhs - compute_norm2_kernel(b_entry, rhs_norms_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_norm2_kernel(b_entry, rhs_norms_entry); // 
r = b - copy_kernel(b_entry, r_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( + b_entry, r_entry); // r = b - A*x advanced_apply_kernel(static_cast(-1.0), A_entry, @@ -46,7 +48,8 @@ inline void update_p( const gko::batch::multi_vector::batch_item& p_entry) { if (rho_old_entry.values[0] == zero()) { - copy_kernel(z_entry, p_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( + z_entry, p_entry); return; } const ValueType beta = rho_new_entry.values[0] / rho_old_entry.values[0]; @@ -67,7 +70,9 @@ inline void update_x_and_r( const gko::batch::multi_vector::batch_item& x_entry, const gko::batch::multi_vector::batch_item& r_entry) { - compute_conj_dot_product_kernel(p_entry, Ap_entry, alpha_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_conj_dot_product_kernel(p_entry, Ap_entry, + alpha_entry); const ValueType temp = rho_old_entry.values[0] / alpha_entry.values[0]; for (int row = 0; row < r_entry.num_rows; row++) { @@ -154,9 +159,10 @@ inline void batch_entry_cg_impl( prec.apply(gko::batch::to_const(r_entry), z_entry); // rho_new = < r , z > = (r)' * (z) - compute_conj_dot_product_kernel( - gko::batch::to_const(r_entry), gko::batch::to_const(z_entry), - rho_new_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + compute_conj_dot_product_kernel( + gko::batch::to_const(r_entry), gko::batch::to_const(z_entry), + rho_new_entry); ++iter; // use implicit residual norms res_norms_entry.values[0] = sqrt(abs(rho_new_entry.values[0])); @@ -185,7 +191,8 @@ inline void batch_entry_cg_impl( gko::batch::to_const(Ap_entry), alpha_entry, x_entry, r_entry); // rho_old = rho_new - copy_kernel(gko::batch::to_const(rho_new_entry), rho_old_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( + gko::batch::to_const(rho_new_entry), rho_old_entry); } logger.log_iteration(batch_item_id, iter, res_norms_entry.values[0]); From 
a04305292a9cec7c81bf3598dce253f731b3c4c6 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 19 Aug 2024 15:14:26 +0200 Subject: [PATCH 141/448] [kernels] remove GKO_DEVICE_NAMESPACE --- .../base/batch_multi_vector_kernels.cpp | 37 +++++++++---------- omp/base/batch_multi_vector_kernels.cpp | 16 +++----- reference/base/batch_multi_vector_kernels.cpp | 16 +++----- 3 files changed, 30 insertions(+), 39 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.cpp b/common/cuda_hip/base/batch_multi_vector_kernels.cpp index 76565a83f80..17f65487464 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.cpp +++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp @@ -37,19 +37,19 @@ void scale(std::shared_ptr exec, const auto alpha_ub = get_batch_struct(alpha); const auto x_ub = get_batch_struct(x); if (alpha->get_common_size()[1] == 1) { - GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<< - num_blocks, default_block_size, 0, exec->get_stream()>>>( + batch_single_kernels::scale_kernel<<get_stream()>>>( alpha_ub, x_ub, [] __device__(int row, int col, int stride) { return 0; }); } else if (alpha->get_common_size() == x->get_common_size()) { - GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<< - num_blocks, default_block_size, 0, exec->get_stream()>>>( + batch_single_kernels::scale_kernel<<get_stream()>>>( alpha_ub, x_ub, [] __device__(int row, int col, int stride) { return row * stride + col; }); } else { - GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<< - num_blocks, default_block_size, 0, exec->get_stream()>>>( + batch_single_kernels::scale_kernel<<get_stream()>>>( alpha_ub, x_ub, [] __device__(int row, int col, int stride) { return col; }); } @@ -71,11 +71,11 @@ void add_scaled(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); if (alpha->get_common_size()[1] == 1) { - GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<< + 
batch_single_kernels::add_scaled_kernel<<< num_blocks, default_block_size, 0, exec->get_stream()>>>( alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; }); } else { - GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<< + batch_single_kernels::add_scaled_kernel<<< num_blocks, default_block_size, 0, exec->get_stream()>>>( alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; }); } @@ -96,10 +96,9 @@ void compute_dot(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); const auto res_ub = get_batch_struct(result); - GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_gen_dot_product_kernel<<get_stream()>>>( - x_ub, y_ub, res_ub, [] __device__(auto val) { return val; }); + batch_single_kernels::compute_gen_dot_product_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + x_ub, y_ub, res_ub, [] __device__(auto val) { return val; }); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -117,10 +116,9 @@ void compute_conj_dot(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); const auto res_ub = get_batch_struct(result); - GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_gen_dot_product_kernel<<get_stream()>>>( - x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); }); + batch_single_kernels::compute_gen_dot_product_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); }); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -136,8 +134,9 @@ void compute_norm2(std::shared_ptr exec, const auto num_rhs = x->get_common_size()[1]; const auto x_ub = get_batch_struct(x); const auto res_ub = get_batch_struct(result); - GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel<<< - num_blocks, default_block_size, 0, exec->get_stream()>>>(x_ub, res_ub); + batch_single_kernels::compute_norm2_kernel<<get_stream()>>>( + x_ub, res_ub); } 
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -152,7 +151,7 @@ void copy(std::shared_ptr exec, const auto num_blocks = x->get_num_batch_items(); const auto result_ub = get_batch_struct(result); const auto x_ub = get_batch_struct(x); - GKO_DEVICE_NAMESPACE::batch_single_kernels:: + batch_single_kernels:: copy_kernel<<get_stream()>>>( x_ub, result_ub); } diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp index 8a947107479..f740e3c32f0 100644 --- a/omp/base/batch_multi_vector_kernels.cpp +++ b/omp/base/batch_multi_vector_kernels.cpp @@ -33,7 +33,7 @@ void scale(std::shared_ptr exec, for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch); const auto x_b = gko::batch::extract_batch_item(x_ub, batch); - GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel(alpha_b, x_b); + batch_single_kernels::scale_kernel(alpha_b, x_b); } } @@ -55,8 +55,7 @@ void add_scaled(std::shared_ptr exec, const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch); const auto x_b = gko::batch::extract_batch_item(x_ub, batch); const auto y_b = gko::batch::extract_batch_item(y_ub, batch); - GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel(alpha_b, - x_b, y_b); + batch_single_kernels::add_scaled_kernel(alpha_b, x_b, y_b); } } @@ -78,8 +77,7 @@ void compute_dot(std::shared_ptr exec, const auto res_b = gko::batch::extract_batch_item(res_ub, batch); const auto x_b = gko::batch::extract_batch_item(x_ub, batch); const auto y_b = gko::batch::extract_batch_item(y_ub, batch); - GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_dot_product_kernel( - x_b, y_b, res_b); + batch_single_kernels::compute_dot_product_kernel(x_b, y_b, res_b); } } @@ -101,8 +99,7 @@ void compute_conj_dot(std::shared_ptr exec, const auto res_b = gko::batch::extract_batch_item(res_ub, batch); const auto x_b = gko::batch::extract_batch_item(x_ub, batch); const auto y_b = 
gko::batch::extract_batch_item(y_ub, batch); - GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_conj_dot_product_kernel(x_b, y_b, res_b); + batch_single_kernels::compute_conj_dot_product_kernel(x_b, y_b, res_b); } } @@ -121,8 +118,7 @@ void compute_norm2(std::shared_ptr exec, for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { const auto res_b = gko::batch::extract_batch_item(res_ub, batch); const auto x_b = gko::batch::extract_batch_item(x_ub, batch); - GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel(x_b, - res_b); + batch_single_kernels::compute_norm2_kernel(x_b, res_b); } } @@ -141,7 +137,7 @@ void copy(std::shared_ptr exec, for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { const auto result_b = gko::batch::extract_batch_item(result_ub, batch); const auto x_b = gko::batch::extract_batch_item(x_ub, batch); - GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(x_b, result_b); + batch_single_kernels::copy_kernel(x_b, result_b); } } diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp index c05398226f0..f5e1c653054 100644 --- a/reference/base/batch_multi_vector_kernels.cpp +++ b/reference/base/batch_multi_vector_kernels.cpp @@ -35,7 +35,7 @@ void scale(std::shared_ptr exec, for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { const auto alpha_b = batch::extract_batch_item(alpha_ub, batch); const auto x_b = batch::extract_batch_item(x_ub, batch); - GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel(alpha_b, x_b); + batch_single_kernels::scale_kernel(alpha_b, x_b); } } @@ -56,8 +56,7 @@ void add_scaled(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch); const auto x_b = batch::extract_batch_item(x_ub, batch); const auto y_b = batch::extract_batch_item(y_ub, batch); - GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel(alpha_b, - x_b, y_b); + 
batch_single_kernels::add_scaled_kernel(alpha_b, x_b, y_b); } } @@ -78,8 +77,7 @@ void compute_dot(std::shared_ptr exec, const auto res_b = batch::extract_batch_item(res_ub, batch); const auto x_b = batch::extract_batch_item(x_ub, batch); const auto y_b = batch::extract_batch_item(y_ub, batch); - GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_dot_product_kernel( - x_b, y_b, res_b); + batch_single_kernels::compute_dot_product_kernel(x_b, y_b, res_b); } } @@ -100,8 +98,7 @@ void compute_conj_dot(std::shared_ptr exec, const auto res_b = batch::extract_batch_item(res_ub, batch); const auto x_b = batch::extract_batch_item(x_ub, batch); const auto y_b = batch::extract_batch_item(y_ub, batch); - GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_conj_dot_product_kernel(x_b, y_b, res_b); + batch_single_kernels::compute_conj_dot_product_kernel(x_b, y_b, res_b); } } @@ -119,8 +116,7 @@ void compute_norm2(std::shared_ptr exec, for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) { const auto res_b = batch::extract_batch_item(res_ub, batch); const auto x_b = batch::extract_batch_item(x_ub, batch); - GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel(x_b, - res_b); + batch_single_kernels::compute_norm2_kernel(x_b, res_b); } } @@ -138,7 +134,7 @@ void copy(std::shared_ptr exec, for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) { const auto result_b = batch::extract_batch_item(result_ub, batch); const auto x_b = batch::extract_batch_item(x_ub, batch); - GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(x_b, result_b); + batch_single_kernels::copy_kernel(x_b, result_b); } } From 006943b79b655bc962d9ade087cf0c172271a400 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 19 Aug 2024 15:31:01 +0200 Subject: [PATCH 142/448] [dpcpp] move to proper headers --- dpcpp/base/batch_multi_vector_kernels.dp.cpp | 64 +++++++++---------- ...hpp.inc => batch_multi_vector_kernels.hpp} | 29 +++++++++ 
dpcpp/solver/batch_bicgstab_kernels.dp.cpp | 2 +- dpcpp/solver/batch_bicgstab_kernels.hpp.inc | 43 ++++++++----- dpcpp/solver/batch_cg_kernels.dp.cpp | 2 +- dpcpp/solver/batch_cg_kernels.hpp.inc | 25 +++++--- 6 files changed, 102 insertions(+), 63 deletions(-) rename dpcpp/base/{batch_multi_vector_kernels.hpp.inc => batch_multi_vector_kernels.hpp} (92%) diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 8f607725bc8..0d2662bdccd 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -15,6 +15,7 @@ #include "core/base/batch_struct.hpp" #include "core/components/prefix_sum_kernels.hpp" +#include "dpcpp/base/batch_multi_vector_kernels.hpp" #include "dpcpp/base/batch_struct.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" @@ -29,17 +30,9 @@ namespace gko { namespace kernels { namespace dpcpp { -/** - * @brief The MultiVector matrix format namespace. - * @ref MultiVector - * @ingroup batch_multi_vector - */ namespace batch_multi_vector { -#include "dpcpp/base/batch_multi_vector_kernels.hpp.inc" - - template void scale(std::shared_ptr exec, const batch::MultiVector* const alpha, @@ -71,7 +64,7 @@ void scale(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, group_id); const auto x_b = batch::extract_batch_item(x_ub, group_id); - scale_kernel( + batch_single_kernels::scale_kernel( alpha_b, x_b, item_ct1, [](int row, int col, int stride) { return 0; }); }); @@ -85,10 +78,11 @@ void scale(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, group_id); const auto x_b = batch::extract_batch_item(x_ub, group_id); - scale_kernel(alpha_b, x_b, item_ct1, - [](int row, int col, int stride) { - return row * stride + col; - }); + batch_single_kernels::scale_kernel( + alpha_b, x_b, item_ct1, + [](int row, int col, int stride) { + return row * stride + col; + }); }); }); } else { @@ -100,7 +94,7 
@@ void scale(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, group_id); const auto x_b = batch::extract_batch_item(x_ub, group_id); - scale_kernel( + batch_single_kernels::scale_kernel( alpha_b, x_b, item_ct1, [](int row, int col, int stride) { return col; }); }); @@ -144,8 +138,9 @@ void add_scaled(std::shared_ptr exec, batch::extract_batch_item(alpha_ub, group_id); const auto x_b = batch::extract_batch_item(x_ub, group_id); const auto y_b = batch::extract_batch_item(y_ub, group_id); - add_scaled_kernel(alpha_b, x_b, y_b, item_ct1, - [](auto col) { return 0; }); + batch_single_kernels::add_scaled_kernel( + alpha_b, x_b, y_b, item_ct1, + [](auto col) { return 0; }); }); }); } else { @@ -158,8 +153,9 @@ void add_scaled(std::shared_ptr exec, batch::extract_batch_item(alpha_ub, group_id); const auto x_b = batch::extract_batch_item(x_ub, group_id); const auto y_b = batch::extract_batch_item(y_ub, group_id); - add_scaled_kernel(alpha_b, x_b, y_b, item_ct1, - [](auto col) { return col; }); + batch_single_kernels::add_scaled_kernel( + alpha_b, x_b, y_b, item_ct1, + [](auto col) { return col; }); }); }); } @@ -206,7 +202,7 @@ void compute_dot(std::shared_ptr exec, batch::extract_batch_item(y_ub, group_id); const auto res_b = batch::extract_batch_item(res_ub, group_id); - single_rhs_compute_conj_dot_sg( + batch_single_kernels::single_rhs_compute_conj_dot_sg( x_b.num_rows, x_b.values, y_b.values, res_b.values[0], item_ct1); }); @@ -226,7 +222,7 @@ void compute_dot(std::shared_ptr exec, batch::extract_batch_item(y_ub, group_id); const auto res_b = batch::extract_batch_item(res_ub, group_id); - compute_gen_dot_product_kernel( + batch_single_kernels::compute_gen_dot_product_kernel( x_b, y_b, res_b, item_ct1, [](auto val) { return val; }); }); @@ -272,7 +268,7 @@ void compute_conj_dot(std::shared_ptr exec, const auto y_b = batch::extract_batch_item(y_ub, group_id); const auto res_b = batch::extract_batch_item(res_ub, group_id); - 
compute_gen_dot_product_kernel( + batch_single_kernels::compute_gen_dot_product_kernel( x_b, y_b, res_b, item_ct1, [](auto val) { return conj(val); }); }); @@ -308,17 +304,16 @@ void compute_norm2(std::shared_ptr exec, exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(max_subgroup_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto x_b = - batch::extract_batch_item(x_ub, group_id); - const auto res_b = - batch::extract_batch_item(res_ub, group_id); - single_rhs_compute_norm2_sg(x_b.num_rows, x_b.values, - res_b.values[0], item_ct1); - }); + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + max_subgroup_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto x_b = batch::extract_batch_item(x_ub, group_id); + const auto res_b = + batch::extract_batch_item(res_ub, group_id); + batch_single_kernels::single_rhs_compute_norm2_sg( + x_b.num_rows, x_b.values, res_b.values[0], item_ct1); + }); }); } else { exec->get_queue()->submit([&](sycl::handler& cgh) { @@ -332,7 +327,8 @@ void compute_norm2(std::shared_ptr exec, batch::extract_batch_item(x_ub, group_id); const auto res_b = batch::extract_batch_item(res_ub, group_id); - compute_norm2_kernel(x_b, res_b, item_ct1); + batch_single_kernels::compute_norm2_kernel(x_b, res_b, + item_ct1); }); }); } @@ -371,7 +367,7 @@ void copy(std::shared_ptr exec, const auto x_b = batch::extract_batch_item(x_ub, group_id); const auto result_b = batch::extract_batch_item(result_ub, group_id); - copy_kernel(x_b, result_b, item_ct1); + batch_single_kernels::copy_kernel(x_b, result_b, item_ct1); }); }); } diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp similarity index 92% rename from dpcpp/base/batch_multi_vector_kernels.hpp.inc rename to 
dpcpp/base/batch_multi_vector_kernels.hpp index c41eafd7efd..a16df237e34 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp.inc +++ b/dpcpp/base/batch_multi_vector_kernels.hpp @@ -2,6 +2,29 @@ // // SPDX-License-Identifier: BSD-3-Clause + +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template __dpct_inline__ void scale_kernel( const gko::batch::multi_vector::batch_item& alpha, @@ -229,3 +252,9 @@ __dpct_inline__ void copy_kernel( out.values[i * out.stride + j] = in.values[i * in.stride + j]; } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp index bb84283b49f..7dc8f3ec23b 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp +++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp @@ -13,6 +13,7 @@ #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "dpcpp/base/batch_multi_vector_kernels.hpp" #include "dpcpp/base/batch_struct.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" @@ -36,7 +37,6 @@ namespace dpcpp { namespace batch_bicgstab { -#include "dpcpp/base/batch_multi_vector_kernels.hpp.inc" #include "dpcpp/matrix/batch_csr_kernels.hpp.inc" #include "dpcpp/matrix/batch_dense_kernels.hpp.inc" #include "dpcpp/matrix/batch_ell_kernels.hpp.inc" diff --git a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc 
b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc index ad7eaeff556..f5a88e9d59d 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc +++ b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc @@ -39,11 +39,13 @@ __dpct_inline__ void initialize( item_ct1.barrier(sycl::access::fence_space::global_and_local); if (sg_id == 0) { - single_rhs_compute_norm2_sg(num_rows, r_shared_entry, res_norm, - item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_norm2_sg(num_rows, r_shared_entry, res_norm, + item_ct1); } else if (sg_id == 1) { - single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norm, - item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norm, + item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -86,8 +88,9 @@ __dpct_inline__ void compute_alpha(const int num_rows, const ValueType& rho_new, const auto sg_id = sg.get_group_id(); const auto tid = item_ct1.get_local_linear_id(); if (sg_id == 0) { - single_rhs_compute_conj_dot_sg(num_rows, r_hat_shared_entry, - v_shared_entry, alpha, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot_sg(num_rows, r_hat_shared_entry, + v_shared_entry, alpha, item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); if (tid == 0) { @@ -123,11 +126,13 @@ __dpct_inline__ void compute_omega(const int num_rows, const auto sg_id = sg.get_group_id(); const auto tid = item_ct1.get_local_linear_id(); if (sg_id == 0) { - single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, s_shared_entry, - omega, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, + s_shared_entry, omega, item_ct1); } else if (sg_id == 1) { - single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, t_shared_entry, - temp, item_ct1); + 
gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, + t_shared_entry, temp, item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); if (tid == 0) { @@ -308,8 +313,9 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, // rho_new = < r_hat , r > = (r_hat)' * (r) if (sg_id == 0) { - single_rhs_compute_conj_dot_sg(num_rows, r_hat_sh, r_sh, - rho_new_sh[0], item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot_sg(num_rows, r_hat_sh, r_sh, + rho_new_sh[0], item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -338,8 +344,9 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, // an estimate of residual norms if (sg_id == 0) { - single_rhs_compute_norm2_sg(num_rows, s_sh, norms_res_sh[0], - item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_norm2_sg(num_rows, s_sh, norms_res_sh[0], + item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -368,8 +375,9 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, item_ct1.barrier(sycl::access::fence_space::global_and_local); if (sg_id == 0) - single_rhs_compute_norm2_sg(num_rows, r_sh, norms_res_sh[0], - item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_norm2_sg(num_rows, r_sh, norms_res_sh[0], + item_ct1); if (tid == group_size - 1) { rho_old_sh[0] = rho_new_sh[0]; } @@ -379,6 +387,7 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, logger.log_iteration(batch_id, iter, norms_res_sh[0]); // copy x back to global memory - copy_kernel(num_rows, x_sh, x_global_entry, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( + num_rows, x_sh, x_global_entry, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); } diff 
--git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp index 61591f9efb6..f25d8266803 100644 --- a/dpcpp/solver/batch_cg_kernels.dp.cpp +++ b/dpcpp/solver/batch_cg_kernels.dp.cpp @@ -13,6 +13,7 @@ #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" +#include "dpcpp/base/batch_multi_vector_kernels.hpp" #include "dpcpp/base/batch_struct.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" @@ -36,7 +37,6 @@ namespace dpcpp { namespace batch_cg { -#include "dpcpp/base/batch_multi_vector_kernels.hpp.inc" #include "dpcpp/matrix/batch_csr_kernels.hpp.inc" #include "dpcpp/matrix/batch_dense_kernels.hpp.inc" #include "dpcpp/matrix/batch_ell_kernels.hpp.inc" diff --git a/dpcpp/solver/batch_cg_kernels.hpp.inc b/dpcpp/solver/batch_cg_kernels.hpp.inc index cef6e620b64..7a91bcb2bbf 100644 --- a/dpcpp/solver/batch_cg_kernels.hpp.inc +++ b/dpcpp/solver/batch_cg_kernels.hpp.inc @@ -40,11 +40,13 @@ __dpct_inline__ void initialize( // Compute norms of rhs // and rho_old = r' * z if (sg_id == 0) { - single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norms, - item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norms, + item_ct1); } else if (sg_id == 1) { - single_rhs_compute_conj_dot_sg(num_rows, r_shared_entry, z_shared_entry, - rho_old, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot_sg(num_rows, r_shared_entry, + z_shared_entry, rho_old, item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -80,9 +82,10 @@ __dpct_inline__ void update_x_and_r( auto sg = item_ct1.get_sub_group(); const auto tid = item_ct1.get_local_linear_id(); if (sg.get_group_id() == 0) { - single_rhs_compute_conj_dot_sg(num_rows, p_shared_entry, - Ap_shared_entry, alpha_shared_entry, - item_ct1); + 
gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot_sg(num_rows, p_shared_entry, + Ap_shared_entry, alpha_shared_entry, + item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); if (tid == 0) { @@ -221,8 +224,9 @@ __dpct_inline__ void apply_kernel( // rho_new = (r)' * (z) if (sg_id == 0) { - single_rhs_compute_conj_dot_sg(num_rows, r_sh, z_sh, rho_new_sh[0], - item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + single_rhs_compute_conj_dot_sg(num_rows, r_sh, z_sh, + rho_new_sh[0], item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -239,6 +243,7 @@ __dpct_inline__ void apply_kernel( logger.log_iteration(batch_id, iter, norms_res_sh[0]); // copy x back to global memory - copy_kernel(num_rows, x_sh, x_global_entry, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( + num_rows, x_sh, x_global_entry, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); } From b1a300062876a609f6a6cf242e8ea68a32de6f38 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Tue, 20 Aug 2024 12:04:44 +0000 Subject: [PATCH 143/448] [format] Format files Co-authored-by: Pratik Nayak --- dpcpp/base/batch_multi_vector_kernels.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp b/dpcpp/base/batch_multi_vector_kernels.hpp index a16df237e34..bbcc540ae60 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp +++ b/dpcpp/base/batch_multi_vector_kernels.hpp @@ -2,7 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause - #include #include From 5598265d1d575c7dc515c328108042865c83aaae Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Wed, 21 Aug 2024 15:42:29 +0200 Subject: [PATCH 144/448] [cuda, hip] unify csr, dense and ell kernels --- common/cuda_hip/CMakeLists.txt | 3 + .../base/batch_multi_vector_kernels.hpp | 4 + ...launcher.hpp.inc => batch_csr_kernels.cpp} | 57 ++++++++-- ..._kernels.hpp.inc => 
batch_csr_kernels.hpp} | 100 +++++++++-------- ...uncher.hpp.inc => batch_dense_kernels.cpp} | 60 +++++++++-- ...ernels.hpp.inc => batch_dense_kernels.hpp} | 96 +++++++++-------- ...launcher.hpp.inc => batch_ell_kernels.cpp} | 57 ++++++++-- ..._kernels.hpp.inc => batch_ell_kernels.hpp} | 101 ++++++++++-------- .../solver/batch_bicgstab_kernels.hpp.inc | 11 +- .../cuda_hip/solver/batch_cg_kernels.hpp.inc | 8 +- cuda/CMakeLists.txt | 3 - cuda/matrix/batch_csr_kernels.cu | 55 ---------- cuda/matrix/batch_struct.hpp | 8 ++ cuda/solver/batch_bicgstab_kernels.cu | 11 +- cuda/solver/batch_cg_kernels.cu | 11 +- hip/CMakeLists.txt | 3 - hip/matrix/batch_struct.hip.hpp | 7 ++ hip/solver/batch_bicgstab_kernels.hip.cpp | 9 +- hip/solver/batch_cg_kernels.hip.cpp | 9 +- 19 files changed, 356 insertions(+), 257 deletions(-) rename common/cuda_hip/matrix/{batch_csr_kernel_launcher.hpp.inc => batch_csr_kernels.cpp} (64%) rename common/cuda_hip/matrix/{batch_csr_kernels.hpp.inc => batch_csr_kernels.hpp} (66%) rename common/cuda_hip/matrix/{batch_dense_kernel_launcher.hpp.inc => batch_dense_kernels.cpp} (66%) rename common/cuda_hip/matrix/{batch_dense_kernels.hpp.inc => batch_dense_kernels.hpp} (72%) rename common/cuda_hip/matrix/{batch_ell_kernel_launcher.hpp.inc => batch_ell_kernels.cpp} (64%) rename common/cuda_hip/matrix/{batch_ell_kernels.hpp.inc => batch_ell_kernels.hpp} (67%) delete mode 100644 cuda/matrix/batch_csr_kernels.cu diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 15d3a82419e..f5a28596d16 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -23,6 +23,9 @@ set(CUDA_HIP_SOURCES factorization/par_ilut_select_kernels.cpp factorization/par_ilut_spgeam_kernels.cpp factorization/par_ilut_sweep_kernels.cpp + matrix/batch_csr_kernels.cpp + matrix/batch_dense_kernels.cpp + matrix/batch_ell_kernels.cpp matrix/coo_kernels.cpp matrix/dense_kernels.cpp matrix/diagonal_kernels.cpp diff --git 
a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp index bb3aac67b55..0cbbdf9f5ee 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp @@ -35,11 +35,15 @@ namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { namespace batch_single_kernels { +namespace { constexpr auto default_block_size = 256; +} + + template __device__ __forceinline__ void scale( const gko::batch::multi_vector::batch_item& alpha, diff --git a/common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernels.cpp similarity index 64% rename from common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc rename to common/cuda_hip/matrix/batch_csr_kernels.cpp index 18c9dbcb29a..35dc2c17e03 100644 --- a/common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc +++ b/common/cuda_hip/matrix/batch_csr_kernels.cpp @@ -2,6 +2,34 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_csr_kernels.hpp" +#include "core/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_csr { + + +constexpr auto default_block_size = 256; + + template void simple_apply(std::shared_ptr exec, const batch::matrix::Csr* mat, @@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr exec, if (b->get_common_size()[1] > 1) { GKO_NOT_IMPLEMENTED; } - simple_apply_kernel<<get_stream()>>>(mat_ub, b_ub, x_ub); + batch_single_kernels::simple_apply_kernel<<get_stream()>>>( + mat_ub, b_ub, x_ub); } @@ -41,9 +70,9 @@ void advanced_apply(std::shared_ptr exec, if (b->get_common_size()[1] > 1) { 
GKO_NOT_IMPLEMENTED; } - advanced_apply_kernel<<get_stream()>>>(alpha_ub, mat_ub, b_ub, - beta_ub, x_ub); + batch_single_kernels::advanced_apply_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + alpha_ub, mat_ub, b_ub, beta_ub, x_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( @@ -59,8 +88,10 @@ void scale(std::shared_ptr exec, const auto col_scale_vals = col_scale->get_const_data(); const auto row_scale_vals = row_scale->get_const_data(); const auto mat_ub = get_batch_struct(input); - scale_kernel<<get_stream()>>>( - as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub); + batch_single_kernels:: + scale_kernel<<get_stream()>>>( + as_device_type(col_scale_vals), as_device_type(row_scale_vals), + mat_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( @@ -77,10 +108,16 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_ub = get_batch_struct(alpha); const auto beta_ub = get_batch_struct(beta); const auto mat_ub = get_batch_struct(mat); - add_scaled_identity_kernel<<get_stream()>>>(alpha_ub, beta_ub, - mat_ub); + batch_single_kernels::add_scaled_identity_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + alpha_ub, beta_ub, mat_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL); + + +} // namespace batch_csr +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernels.hpp similarity index 66% rename from common/cuda_hip/matrix/batch_csr_kernels.hpp.inc rename to common/cuda_hip/matrix/batch_csr_kernels.hpp index e041dadaa3e..32d22e435eb 100644 --- a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp @@ -2,6 +2,44 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + +#include +#include +#include +#include +#include + +#include 
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/base/batch_struct.hpp" +#include "cuda/matrix/batch_struct.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/batch_struct.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" +#else +#error "batch struct def missing" +#endif + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::csr::batch_item& mat, @@ -21,23 +59,11 @@ __device__ __forceinline__ void simple_apply( } template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix:: - csr::uniform_batch< - const ValueType, - IndexType> - mat, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - b, - const gko::batch:: - multi_vector:: - uniform_batch< - ValueType> - x) +__global__ __launch_bounds__(default_block_size) void simple_apply_kernel( + const gko::batch::matrix::csr::uniform_batch + mat, + const gko::batch::multi_vector::uniform_batch b, + const gko::batch::multi_vector::uniform_batch x) { for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; batch_id += gridDim.x) { @@ -71,33 +97,13 @@ __device__ __forceinline__ void advanced_apply( } template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void advanced_apply_kernel(const gko::batch:: - 
multi_vector:: - uniform_batch< - const ValueType> - alpha, - const gko::batch::matrix:: - csr::uniform_batch< - const ValueType, - IndexType> - mat, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - b, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - beta, - const gko::batch:: - multi_vector:: - uniform_batch< - ValueType> - x) +__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel( + const gko::batch::multi_vector::uniform_batch alpha, + const gko::batch::matrix::csr::uniform_batch + mat, + const gko::batch::multi_vector::uniform_batch b, + const gko::batch::multi_vector::uniform_batch beta, + const gko::batch::multi_vector::uniform_batch x) { for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; batch_id += gridDim.x) { @@ -196,3 +202,9 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.cpp similarity index 66% rename from common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc rename to common/cuda_hip/matrix/batch_dense_kernels.cpp index 8fdb001fd1f..44dad55aa70 100644 --- a/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc +++ b/common/cuda_hip/matrix/batch_dense_kernels.cpp @@ -2,6 +2,34 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels 
{ +namespace GKO_DEVICE_NAMESPACE { +namespace batch_dense { + + +constexpr auto default_block_size = 256; + + template void simple_apply(std::shared_ptr exec, const batch::matrix::Dense* mat, @@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr exec, if (b->get_common_size()[1] > 1) { GKO_NOT_IMPLEMENTED; } - simple_apply_kernel<<get_stream()>>>(mat_ub, b_ub, x_ub); + batch_single_kernels::simple_apply_kernel<<get_stream()>>>( + mat_ub, b_ub, x_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -40,9 +69,9 @@ void advanced_apply(std::shared_ptr exec, if (b->get_common_size()[1] > 1) { GKO_NOT_IMPLEMENTED; } - advanced_apply_kernel<<get_stream()>>>(alpha_ub, mat_ub, b_ub, - beta_ub, x_ub); + batch_single_kernels::advanced_apply_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + alpha_ub, mat_ub, b_ub, beta_ub, x_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( @@ -58,8 +87,10 @@ void scale(std::shared_ptr exec, const auto col_scale_vals = col_scale->get_const_data(); const auto row_scale_vals = row_scale->get_const_data(); const auto mat_ub = get_batch_struct(input); - scale_kernel<<get_stream()>>>( - as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub); + batch_single_kernels:: + scale_kernel<<get_stream()>>>( + as_device_type(col_scale_vals), as_device_type(row_scale_vals), + mat_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL); @@ -75,7 +106,8 @@ void scale_add(std::shared_ptr exec, const auto alpha_ub = get_batch_struct(alpha); const auto mat_ub = get_batch_struct(mat); const auto in_out_ub = get_batch_struct(in_out); - scale_add_kernel<<get_stream()>>>( + batch_single_kernels::scale_add_kernel<<get_stream()>>>( alpha_ub, mat_ub, in_out_ub); } @@ -92,10 +124,16 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_ub = get_batch_struct(alpha); const auto beta_ub = get_batch_struct(beta); const auto mat_ub = get_batch_struct(mat); - add_scaled_identity_kernel<<get_stream()>>>(alpha_ub, 
beta_ub, - mat_ub); + batch_single_kernels::add_scaled_identity_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + alpha_ub, beta_ub, mat_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL); + + +} // namespace batch_dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.hpp similarity index 72% rename from common/cuda_hip/matrix/batch_dense_kernels.hpp.inc rename to common/cuda_hip/matrix/batch_dense_kernels.hpp index f8abf9131a1..74b81008b38 100644 --- a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp @@ -2,6 +2,44 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/base/batch_struct.hpp" +#include "cuda/matrix/batch_struct.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/batch_struct.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" +#else +#error "batch struct def missing" +#endif + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::dense::batch_item& mat, @@ -33,22 +71,10 @@ __device__ 
__forceinline__ void simple_apply( } template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix:: - dense::uniform_batch< - const ValueType> - mat, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - b, - const gko::batch:: - multi_vector:: - uniform_batch< - ValueType> - x) +__global__ __launch_bounds__(default_block_size) void simple_apply_kernel( + const gko::batch::matrix::dense::uniform_batch mat, + const gko::batch::multi_vector::uniform_batch b, + const gko::batch::multi_vector::uniform_batch x) { for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; batch_id += gridDim.x) { @@ -94,32 +120,12 @@ __device__ __forceinline__ void advanced_apply( } template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void advanced_apply_kernel(const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - alpha, - const gko::batch::matrix:: - dense::uniform_batch< - const ValueType> - mat, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - b, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - beta, - const gko::batch:: - multi_vector:: - uniform_batch< - ValueType> - x) +__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel( + const gko::batch::multi_vector::uniform_batch alpha, + const gko::batch::matrix::dense::uniform_batch mat, + const gko::batch::multi_vector::uniform_batch b, + const gko::batch::multi_vector::uniform_batch beta, + const gko::batch::multi_vector::uniform_batch x) { for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; batch_id += gridDim.x) { @@ -243,3 +249,9 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git 
a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.cpp similarity index 64% rename from common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc rename to common/cuda_hip/matrix/batch_ell_kernels.cpp index 7e69b119c85..c56325ab824 100644 --- a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc +++ b/common/cuda_hip/matrix/batch_ell_kernels.cpp @@ -2,6 +2,34 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_ell_kernels.hpp" +#include "core/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_ell { + + +constexpr auto default_block_size = 256; + + template void simple_apply(std::shared_ptr exec, const batch::matrix::Ell* mat, @@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr exec, if (b->get_common_size()[1] > 1) { GKO_NOT_IMPLEMENTED; } - simple_apply_kernel<<get_stream()>>>(mat_ub, b_ub, x_ub); + batch_single_kernels::simple_apply_kernel<<get_stream()>>>( + mat_ub, b_ub, x_ub); } @@ -41,9 +70,9 @@ void advanced_apply(std::shared_ptr exec, if (b->get_common_size()[1] > 1) { GKO_NOT_IMPLEMENTED; } - advanced_apply_kernel<<get_stream()>>>(alpha_ub, mat_ub, b_ub, - beta_ub, x_ub); + batch_single_kernels::advanced_apply_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + alpha_ub, mat_ub, b_ub, beta_ub, x_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( @@ -59,8 +88,10 @@ void scale(std::shared_ptr exec, const auto col_scale_vals = col_scale->get_const_data(); const auto row_scale_vals = row_scale->get_const_data(); const auto mat_ub = get_batch_struct(input); - scale_kernel<<get_stream()>>>( - 
as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub); + batch_single_kernels:: + scale_kernel<<get_stream()>>>( + as_device_type(col_scale_vals), as_device_type(row_scale_vals), + mat_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( @@ -77,10 +108,16 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_ub = get_batch_struct(alpha); const auto beta_ub = get_batch_struct(beta); const auto mat_ub = get_batch_struct(mat); - add_scaled_identity_kernel<<get_stream()>>>(alpha_ub, beta_ub, - mat_ub); + batch_single_kernels::add_scaled_identity_kernel<<< + num_blocks, default_block_size, 0, exec->get_stream()>>>( + alpha_ub, beta_ub, mat_ub); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL); + + +} // namespace batch_ell +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp similarity index 67% rename from common/cuda_hip/matrix/batch_ell_kernels.hpp.inc rename to common/cuda_hip/matrix/batch_ell_kernels.hpp index 0a6d1927c96..e8cadc29cd3 100644 --- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp @@ -2,6 +2,44 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + +#include +#include +#include +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" + +#if 
defined(GKO_COMPILING_CUDA) +#include "cuda/base/batch_struct.hpp" +#include "cuda/matrix/batch_struct.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/batch_struct.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" +#else +#error "batch struct def missing" +#endif + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::ell::batch_item& mat, @@ -28,23 +66,11 @@ __device__ __forceinline__ void simple_apply( } template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix:: - ell::uniform_batch< - const ValueType, - IndexType> - mat, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - b, - const gko::batch:: - multi_vector:: - uniform_batch< - ValueType> - x) +__global__ __launch_bounds__(default_block_size) void simple_apply_kernel( + const gko::batch::matrix::ell::uniform_batch + mat, + const gko::batch::multi_vector::uniform_batch b, + const gko::batch::multi_vector::uniform_batch x) { for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; batch_id += gridDim.x) { @@ -84,34 +110,15 @@ __device__ __forceinline__ void advanced_apply( } } + template -__global__ __launch_bounds__( - default_block_size, - sm_oversubscription) void advanced_apply_kernel(const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - alpha, - const gko::batch::matrix:: - ell::uniform_batch< - const ValueType, - IndexType> - mat, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - b, - const gko::batch:: - multi_vector:: - uniform_batch< - const ValueType> - beta, - const gko::batch:: - multi_vector:: - uniform_batch< - ValueType> - x) +__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel( + const gko::batch::multi_vector::uniform_batch alpha, + const 
gko::batch::matrix::ell::uniform_batch + mat, + const gko::batch::multi_vector::uniform_batch b, + const gko::batch::multi_vector::uniform_batch beta, + const gko::batch::multi_vector::uniform_batch x) { for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; batch_id += gridDim.x) { @@ -205,3 +212,9 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc index c2a53b2e518..d4ce149d394 100644 --- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc +++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc @@ -27,8 +27,9 @@ __device__ __forceinline__ void initialize( __syncthreads(); // r = b - A*x - advanced_apply(static_cast(-1.0), mat_entry, x_shared_entry, - static_cast(1.0), r_shared_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + static_cast(-1.0), mat_entry, x_shared_entry, + static_cast(1.0), r_shared_entry); __syncthreads(); if (threadIdx.x / config::warp_size == 0) { @@ -295,7 +296,8 @@ __global__ void apply_kernel( __syncthreads(); // v = A * p_hat - simple_apply(mat_entry, p_hat_sh, v_sh); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + simple_apply(mat_entry, p_hat_sh, v_sh); __syncthreads(); // alpha = rho_new / < r_hat , v> @@ -327,7 +329,8 @@ __global__ void apply_kernel( __syncthreads(); // t = A * s_hat - simple_apply(mat_entry, s_hat_sh, t_sh); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + simple_apply(mat_entry, s_hat_sh, t_sh); __syncthreads(); // omega = / diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc index c95a6b1cf05..4f4b382f552 100644 --- a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc 
+++ b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc @@ -22,8 +22,9 @@ __device__ __forceinline__ void initialize( __syncthreads(); // r = b - A*x - advanced_apply(static_cast(-1.0), mat_entry, x_shared_entry, - static_cast(1.0), r_shared_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + static_cast(-1.0), mat_entry, x_shared_entry, + static_cast(1.0), r_shared_entry); __syncthreads(); // z = precond * r @@ -189,7 +190,8 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf, } // Ap = A * p - simple_apply(mat_entry, p_sh, Ap_sh); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: + simple_apply(mat_entry, p_sh, Ap_sh); __syncthreads(); // alpha = rho_old / (p' * Ap) diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 3631a65f48d..000cb7b215f 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -16,9 +16,6 @@ target_sources(ginkgo_cuda base/stream.cpp base/timer.cpp base/version.cpp - matrix/batch_csr_kernels.cu - matrix/batch_dense_kernels.cu - matrix/batch_ell_kernels.cu ${CSR_INSTANTIATE} ${FBCSR_INSTANTIATE} matrix/fft_kernels.cu diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu deleted file mode 100644 index 95b4f85cdfc..00000000000 --- a/cuda/matrix/batch_csr_kernels.cu +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_csr_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include 
"cuda/base/batch_struct.hpp" -#include "cuda/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Csr matrix format namespace. - * @ref Csr - * @ingroup batch_csr - */ -namespace batch_csr { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_csr -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp index 5845fb2235e..8a1b8fee00a 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/matrix/batch_struct.hpp @@ -9,6 +9,7 @@ #include #include +#include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" @@ -17,6 +18,13 @@ namespace gko { namespace kernels { namespace cuda { +namespace { + + +constexpr auto default_block_size = 256; + + +} /** @file batch_struct.hpp diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 4d3deb742fe..09e737c8793 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -19,6 +19,9 @@ #include "common/cuda_hip/components/reduction.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" @@ -31,11 +34,6 @@ namespace kernels { namespace cuda { -// NOTE: this default block size is not used for the 
main solver kernel. -constexpr int default_block_size = 256; -constexpr int sm_oversubscription = 4; - - /** * @brief The batch Bicgstab solver namespace. * @@ -44,9 +42,6 @@ constexpr int sm_oversubscription = 4; namespace batch_bicgstab { -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc" diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 21c3e3d43c4..7ac876de3a2 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -18,6 +18,9 @@ #include "common/cuda_hip/components/reduction.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" @@ -30,11 +33,6 @@ namespace kernels { namespace cuda { -// NOTE: this default block size is not used for the main solver kernel. -constexpr int default_block_size = 256; -constexpr int sm_oversubscription = 4; - - /** * @brief The batch Cg solver namespace. 
* @@ -43,9 +41,6 @@ constexpr int sm_oversubscription = 4; namespace batch_cg { -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" #include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc" diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 84bba295120..7d914d57a81 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -14,9 +14,6 @@ set(GINKGO_HIP_SOURCES base/stream.hip.cpp base/timer.hip.cpp base/version.hip.cpp - matrix/batch_csr_kernels.hip.cpp - matrix/batch_dense_kernels.hip.cpp - matrix/batch_ell_kernels.hip.cpp ${CSR_INSTANTIATE} ${FBCSR_INSTANTIATE} preconditioner/batch_jacobi_kernels.hip.cpp diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp index bb9f7912cd6..a8d14b84bb7 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/matrix/batch_struct.hip.hpp @@ -17,6 +17,13 @@ namespace gko { namespace kernels { namespace hip { +namespace { + + +constexpr auto default_block_size = 256; + + +} /** @file batch_struct.hpp diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 1c1be8b21f7..f0f1a715a86 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -20,6 +20,9 @@ #include "common/cuda_hip/components/reduction.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" @@ -32,9 +35,6 @@ namespace kernels { namespace hip { -constexpr int default_block_size = 256; -constexpr int sm_oversubscription = 4; - /** * @brief The batch Bicgstab 
solver namespace. * @@ -43,9 +43,6 @@ constexpr int sm_oversubscription = 4; namespace batch_bicgstab { -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc" diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index c860286c17c..b40732535f4 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -20,6 +20,9 @@ #include "common/cuda_hip/components/reduction.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" @@ -32,9 +35,6 @@ namespace kernels { namespace hip { -constexpr int default_block_size = 256; -constexpr int sm_oversubscription = 4; - /** * @brief The batch Cg solver namespace. 
* @@ -43,9 +43,6 @@ constexpr int sm_oversubscription = 4; namespace batch_cg { -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" #include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc" From 68a53e27187f2f5fda51d712f1f9425b02c730f5 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 22 Aug 2024 17:41:40 +0200 Subject: [PATCH 145/448] [ref, omp] unify csr, dense, ell kernels + also fix kernel names: remove _kernel suffix --- omp/matrix/batch_csr_kernels.cpp | 22 ++++++------- omp/matrix/batch_dense_kernels.cpp | 25 +++++++-------- omp/matrix/batch_ell_kernels.cpp | 22 ++++++------- omp/solver/batch_bicgstab_kernels.cpp | 13 ++------ omp/solver/batch_cg_kernels.cpp | 13 ++------ reference/matrix/batch_csr_kernels.cpp | 25 +++++++-------- ..._kernels.hpp.inc => batch_csr_kernels.hpp} | 29 +++++++++++++++-- reference/matrix/batch_dense_kernels.cpp | 28 ++++++++--------- ...ernels.hpp.inc => batch_dense_kernels.hpp} | 31 ++++++++++++++++--- reference/matrix/batch_ell_kernels.cpp | 25 +++++++-------- ..._kernels.hpp.inc => batch_ell_kernels.hpp} | 29 +++++++++++++++-- reference/solver/batch_bicgstab_kernels.cpp | 15 ++------- .../solver/batch_bicgstab_kernels.hpp.inc | 14 ++++----- reference/solver/batch_cg_kernels.cpp | 15 ++------- reference/solver/batch_cg_kernels.hpp.inc | 9 +++--- 15 files changed, 170 insertions(+), 145 deletions(-) rename reference/matrix/{batch_csr_kernels.hpp.inc => batch_csr_kernels.hpp} (81%) rename reference/matrix/{batch_dense_kernels.hpp.inc => batch_dense_kernels.hpp} (84%) rename reference/matrix/{batch_ell_kernels.hpp.inc => batch_ell_kernels.hpp} (84%) diff --git a/omp/matrix/batch_csr_kernels.cpp b/omp/matrix/batch_csr_kernels.cpp index eacb26c12cb..d4ea6cbd642 100644 --- a/omp/matrix/batch_csr_kernels.cpp +++ b/omp/matrix/batch_csr_kernels.cpp @@ -9,26 +9,20 @@ #include #include +#include 
"common/unified/base/kernel_launch.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" #include "reference/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace omp { -/** - * @brief The Csr matrix format namespace. - * @ref Csr - * @ingroup batch_csr - */ namespace batch_csr { -#include "reference/matrix/batch_csr_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Csr* mat, @@ -43,7 +37,7 @@ void simple_apply(std::shared_ptr exec, const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); - simple_apply_kernel(mat_item, b_item, x_item); + batch_single_kernels::simple_apply(mat_item, b_item, x_item); } } @@ -71,8 +65,9 @@ void advanced_apply(std::shared_ptr exec, const auto x_item = batch::extract_batch_item(x_ub, batch); const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); + batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item, + b_item, beta_item.values[0], + x_item); } } @@ -99,7 +94,7 @@ void scale(std::shared_ptr exec, const auto row_scale_b = row_scale_vals + num_rows * batch_id; const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch_id); - scale(col_scale_b, row_scale_b, mat_item); + batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item); } } @@ -122,7 +117,8 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id); const auto beta_b = batch::extract_batch_item(beta_ub, batch_id); const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); - 
add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b); + batch_single_kernels::add_scaled_identity(alpha_b.values[0], + beta_b.values[0], mat_b); } } diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp index 836908260a7..cd4a7f05b4a 100644 --- a/omp/matrix/batch_dense_kernels.cpp +++ b/omp/matrix/batch_dense_kernels.cpp @@ -9,26 +9,20 @@ #include #include +#include "common/unified/base/kernel_launch.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" #include "reference/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace omp { -/** - * @brief The Dense matrix format namespace. - * @ref Dense - * @ingroup batch_dense - */ namespace batch_dense { -#include "reference/matrix/batch_dense_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Dense* mat, @@ -43,7 +37,7 @@ void simple_apply(std::shared_ptr exec, const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); - simple_apply_kernel(mat_item, b_item, x_item); + batch_single_kernels::simple_apply(mat_item, b_item, x_item); } } @@ -71,8 +65,9 @@ void advanced_apply(std::shared_ptr exec, const auto x_item = batch::extract_batch_item(x_ub, batch); const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); + batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item, + b_item, beta_item.values[0], + x_item); } } @@ -98,7 +93,8 @@ void scale(std::shared_ptr exec, const auto row_scale_b = row_scale_vals + num_rows * batch_id; const auto input_mat = input_vals + 
input->get_num_elements_per_item() * batch_id; - scale(num_rows, num_cols, stride, col_scale_b, row_scale_b, input_mat); + batch_single_kernels::scale(num_rows, num_cols, stride, col_scale_b, + row_scale_b, input_mat); } } @@ -121,7 +117,7 @@ void scale_add(std::shared_ptr exec, const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); const auto input_mat_b = batch::matrix::extract_batch_item(in_mat_ub, batch_id); - scale_add_kernel(alpha_b.values[0], mat_b, input_mat_b); + batch_single_kernels::scale_add(alpha_b.values[0], mat_b, input_mat_b); } } @@ -143,7 +139,8 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id); const auto beta_b = batch::extract_batch_item(beta_ub, batch_id); const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); - add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b); + batch_single_kernels::add_scaled_identity(alpha_b.values[0], + beta_b.values[0], mat_b); } } diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp index 4fb5aeea6fa..8b1239565a1 100644 --- a/omp/matrix/batch_ell_kernels.cpp +++ b/omp/matrix/batch_ell_kernels.cpp @@ -9,26 +9,20 @@ #include #include +#include "common/unified/base/kernel_launch.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" #include "reference/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace omp { -/** - * @brief The Ell matrix format namespace. 
- * @ref Ell - * @ingroup batch_ell - */ namespace batch_ell { -#include "reference/matrix/batch_ell_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Ell* mat, @@ -43,7 +37,7 @@ void simple_apply(std::shared_ptr exec, const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); - simple_apply_kernel(mat_item, b_item, x_item); + batch_single_kernels::simple_apply(mat_item, b_item, x_item); } } @@ -71,8 +65,9 @@ void advanced_apply(std::shared_ptr exec, const auto x_item = batch::extract_batch_item(x_ub, batch); const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); + batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item, + b_item, beta_item.values[0], + x_item); } } @@ -99,7 +94,7 @@ void scale(std::shared_ptr exec, const auto row_scale_b = row_scale_vals + num_rows * batch_id; const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch_id); - scale(col_scale_b, row_scale_b, mat_item); + batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item); } } @@ -122,7 +117,8 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id); const auto beta_b = batch::extract_batch_item(beta_ub, batch_id); const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); - add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b); + batch_single_kernels::add_scaled_identity(alpha_b.values[0], + beta_b.values[0], mat_b); } } diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp index c245f284106..661cdbcd2ec 100644 --- a/omp/solver/batch_bicgstab_kernels.cpp +++ b/omp/solver/batch_bicgstab_kernels.cpp 
@@ -10,28 +10,21 @@ #include "core/solver/batch_dispatch.hpp" #include "reference/base/batch_multi_vector_kernels.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" namespace gko { namespace kernels { namespace omp { -/** - * @brief The batch Bicgstab solver namespace. - * - * @ingroup batch_bicgstab - */ namespace batch_bicgstab { - - namespace { constexpr int max_num_rhs = 1; -#include "reference/matrix/batch_csr_kernels.hpp.inc" -#include "reference/matrix/batch_dense_kernels.hpp.inc" -#include "reference/matrix/batch_ell_kernels.hpp.inc" #include "reference/solver/batch_bicgstab_kernels.hpp.inc" diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp index 55d6ee29321..3a6e31256c2 100644 --- a/omp/solver/batch_cg_kernels.cpp +++ b/omp/solver/batch_cg_kernels.cpp @@ -10,28 +10,21 @@ #include "core/solver/batch_dispatch.hpp" #include "reference/base/batch_multi_vector_kernels.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" namespace gko { namespace kernels { namespace omp { -/** - * @brief The batch Cg solver namespace. 
- * - * @ingroup batch_cg - */ namespace batch_cg { - - namespace { constexpr int max_num_rhs = 1; -#include "reference/matrix/batch_csr_kernels.hpp.inc" -#include "reference/matrix/batch_dense_kernels.hpp.inc" -#include "reference/matrix/batch_ell_kernels.hpp.inc" #include "reference/solver/batch_cg_kernels.hpp.inc" diff --git a/reference/matrix/batch_csr_kernels.cpp b/reference/matrix/batch_csr_kernels.cpp index 7c6d9a6c000..9fbb2e35804 100644 --- a/reference/matrix/batch_csr_kernels.cpp +++ b/reference/matrix/batch_csr_kernels.cpp @@ -9,26 +9,23 @@ #include #include + +#define GKO_DEVICE_NAMESPACE reference + + #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" #include "reference/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace reference { -/** - * @brief The Csr matrix format namespace. - * @ref Csr - * @ingroup batch_csr - */ namespace batch_csr { -#include "reference/matrix/batch_csr_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Csr* mat, @@ -42,7 +39,7 @@ void simple_apply(std::shared_ptr exec, const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); - simple_apply_kernel(mat_item, b_item, x_item); + batch_single_kernels::simple_apply(mat_item, b_item, x_item); } } @@ -69,8 +66,9 @@ void advanced_apply(std::shared_ptr exec, const auto x_item = batch::extract_batch_item(x_ub, batch); const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); + batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item, + b_item, beta_item.values[0], + x_item); } } @@ -96,7 +94,7 @@ 
void scale(std::shared_ptr exec, const auto row_scale_b = row_scale_vals + num_rows * batch_id; const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch_id); - scale(col_scale_b, row_scale_b, mat_item); + batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item); } } @@ -118,7 +116,8 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id); const auto beta_b = batch::extract_batch_item(beta_ub, batch_id); const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); - add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b); + batch_single_kernels::add_scaled_identity(alpha_b.values[0], + beta_b.values[0], mat_b); } } diff --git a/reference/matrix/batch_csr_kernels.hpp.inc b/reference/matrix/batch_csr_kernels.hpp similarity index 81% rename from reference/matrix/batch_csr_kernels.hpp.inc rename to reference/matrix/batch_csr_kernels.hpp index 52e511785a0..e04b2bdf345 100644 --- a/reference/matrix/batch_csr_kernels.hpp.inc +++ b/reference/matrix/batch_csr_kernels.hpp @@ -2,8 +2,25 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + +#include +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template -inline void simple_apply_kernel( +inline void simple_apply( const gko::batch::matrix::csr::batch_item& a, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& c) @@ -25,7 +42,7 @@ inline void simple_apply_kernel( template -inline void advanced_apply_kernel( +inline void advanced_apply( const ValueType alpha, const gko::batch::matrix::csr::batch_item& a, const gko::batch::multi_vector::batch_item& b, @@ -63,7 +80,7 @@ inline void scale( template -inline void add_scaled_identity_kernel( 
+inline void add_scaled_identity( const ValueType alpha, const ValueType beta, const gko::batch::matrix::csr::batch_item& mat) { @@ -76,3 +93,9 @@ inline void add_scaled_identity_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp index 2116a691fb9..99a7d4e8d7b 100644 --- a/reference/matrix/batch_dense_kernels.cpp +++ b/reference/matrix/batch_dense_kernels.cpp @@ -9,26 +9,23 @@ #include #include + +#define GKO_DEVICE_NAMESPACE reference + + #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" #include "reference/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace reference { -/** - * @brief The Dense matrix format namespace. - * @ref Dense - * @ingroup batch_dense - */ namespace batch_dense { -#include "reference/matrix/batch_dense_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Dense* mat, @@ -42,7 +39,7 @@ void simple_apply(std::shared_ptr exec, const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); - simple_apply_kernel(mat_item, b_item, x_item); + batch_single_kernels::simple_apply(mat_item, b_item, x_item); } } @@ -69,8 +66,9 @@ void advanced_apply(std::shared_ptr exec, const auto x_item = batch::extract_batch_item(x_ub, batch); const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); + batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item, + b_item, beta_item.values[0], + x_item); } } 
@@ -95,7 +93,8 @@ void scale(std::shared_ptr exec, const auto row_scale_b = row_scale_vals + num_rows * batch_id; const auto input_mat = input_vals + input->get_num_elements_per_item() * batch_id; - scale(num_rows, num_cols, stride, col_scale_b, row_scale_b, input_mat); + batch_single_kernels::scale(num_rows, num_cols, stride, col_scale_b, + row_scale_b, input_mat); } } @@ -117,7 +116,7 @@ void scale_add(std::shared_ptr exec, const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); const auto input_mat_b = batch::matrix::extract_batch_item(in_mat_ub, batch_id); - scale_add_kernel(alpha_b.values[0], mat_b, input_mat_b); + batch_single_kernels::scale_add(alpha_b.values[0], mat_b, input_mat_b); } } @@ -138,7 +137,8 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id); const auto beta_b = batch::extract_batch_item(beta_ub, batch_id); const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); - add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b); + batch_single_kernels::add_scaled_identity(alpha_b.values[0], + beta_b.values[0], mat_b); } } diff --git a/reference/matrix/batch_dense_kernels.hpp.inc b/reference/matrix/batch_dense_kernels.hpp similarity index 84% rename from reference/matrix/batch_dense_kernels.hpp.inc rename to reference/matrix/batch_dense_kernels.hpp index a017010a644..e12827c77de 100644 --- a/reference/matrix/batch_dense_kernels.hpp.inc +++ b/reference/matrix/batch_dense_kernels.hpp @@ -2,8 +2,25 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + +#include +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template -inline void simple_apply_kernel( +inline void simple_apply( const 
gko::batch::matrix::dense::batch_item& a, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& c) @@ -27,7 +44,7 @@ inline void simple_apply_kernel( template -inline void advanced_apply_kernel( +inline void advanced_apply( const ValueType alpha, const gko::batch::matrix::dense::batch_item& a, const gko::batch::multi_vector::batch_item& b, @@ -75,7 +92,7 @@ inline void scale(const int num_rows, const int num_cols, template -inline void scale_add_kernel( +inline void scale_add( const ValueType alpha, const gko::batch::matrix::dense::batch_item& b, const gko::batch::matrix::dense::batch_item& in_out) @@ -91,7 +108,7 @@ inline void scale_add_kernel( template -inline void add_scaled_identity_kernel( +inline void add_scaled_identity( const ValueType alpha, const ValueType beta, const gko::batch::matrix::dense::batch_item& mat) { @@ -105,3 +122,9 @@ inline void add_scaled_identity_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp index 0d47f9ea601..7772662b216 100644 --- a/reference/matrix/batch_ell_kernels.cpp +++ b/reference/matrix/batch_ell_kernels.cpp @@ -9,26 +9,23 @@ #include #include + +#define GKO_DEVICE_NAMESPACE reference + + #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" #include "reference/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace reference { -/** - * @brief The Ell matrix format namespace. 
- * @ref Ell - * @ingroup batch_ell - */ namespace batch_ell { -#include "reference/matrix/batch_ell_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Ell* mat, @@ -42,7 +39,7 @@ void simple_apply(std::shared_ptr exec, const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch); const auto b_item = batch::extract_batch_item(b_ub, batch); const auto x_item = batch::extract_batch_item(x_ub, batch); - simple_apply_kernel(mat_item, b_item, x_item); + batch_single_kernels::simple_apply(mat_item, b_item, x_item); } } @@ -69,8 +66,9 @@ void advanced_apply(std::shared_ptr exec, const auto x_item = batch::extract_batch_item(x_ub, batch); const auto alpha_item = batch::extract_batch_item(alpha_ub, batch); const auto beta_item = batch::extract_batch_item(beta_ub, batch); - advanced_apply_kernel(alpha_item.values[0], mat_item, b_item, - beta_item.values[0], x_item); + batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item, + b_item, beta_item.values[0], + x_item); } } @@ -96,7 +94,7 @@ void scale(std::shared_ptr exec, const auto row_scale_b = row_scale_vals + num_rows * batch_id; const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch_id); - scale(col_scale_b, row_scale_b, mat_item); + batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item); } } @@ -118,7 +116,8 @@ void add_scaled_identity(std::shared_ptr exec, const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id); const auto beta_b = batch::extract_batch_item(beta_ub, batch_id); const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id); - add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b); + batch_single_kernels::add_scaled_identity(alpha_b.values[0], + beta_b.values[0], mat_b); } } diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp similarity index 84% rename from reference/matrix/batch_ell_kernels.hpp.inc rename to 
reference/matrix/batch_ell_kernels.hpp index 7aea0946573..71bd1ce851a 100644 --- a/reference/matrix/batch_ell_kernels.hpp.inc +++ b/reference/matrix/batch_ell_kernels.hpp @@ -2,8 +2,25 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + +#include +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template -inline void simple_apply_kernel( +inline void simple_apply( const gko::batch::matrix::ell::batch_item& a, const gko::batch::multi_vector::batch_item& b, const gko::batch::multi_vector::batch_item& c) @@ -27,7 +44,7 @@ inline void simple_apply_kernel( template -inline void advanced_apply_kernel( +inline void advanced_apply( const ValueType alpha, const gko::batch::matrix::ell::batch_item& a, const gko::batch::multi_vector::batch_item& b, @@ -73,7 +90,7 @@ inline void scale( template -inline void add_scaled_identity_kernel( +inline void add_scaled_identity( const ValueType alpha, const ValueType beta, const gko::batch::matrix::ell::batch_item& mat) { @@ -91,3 +108,9 @@ inline void add_scaled_identity_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp index e68caffa936..33e1e9392d9 100644 --- a/reference/solver/batch_bicgstab_kernels.cpp +++ b/reference/solver/batch_bicgstab_kernels.cpp @@ -6,30 +6,21 @@ #include "core/solver/batch_dispatch.hpp" #include "reference/base/batch_multi_vector_kernels.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" namespace gko { namespace kernels { namespace reference { - - -/** - * @brief The 
batch Bicgstab solver namespace. - * - * @ingroup batch_bicgstab - */ namespace batch_bicgstab { - - namespace { constexpr int max_num_rhs = 1; -#include "reference/matrix/batch_csr_kernels.hpp.inc" -#include "reference/matrix/batch_dense_kernels.hpp.inc" -#include "reference/matrix/batch_ell_kernels.hpp.inc" #include "reference/solver/batch_bicgstab_kernels.hpp.inc" diff --git a/reference/solver/batch_bicgstab_kernels.hpp.inc b/reference/solver/batch_bicgstab_kernels.hpp.inc index 1f8537ab66d..786e98eb5d1 100644 --- a/reference/solver/batch_bicgstab_kernels.hpp.inc +++ b/reference/solver/batch_bicgstab_kernels.hpp.inc @@ -33,9 +33,9 @@ inline void initialize( b_entry, r_entry); // r = b - A*x - advanced_apply_kernel(static_cast(-1.0), A_entry, - gko::batch::to_const(x_entry), - static_cast(1.0), r_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + static_cast(-1.0), A_entry, gko::batch::to_const(x_entry), + static_cast(1.0), r_entry); gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: compute_norm2_kernel(gko::batch::to_const(r_entry), res_norms_entry); @@ -271,8 +271,8 @@ inline void batch_entry_bicgstab_impl( prec.apply(gko::batch::to_const(p_entry), p_hat_entry); // v = A * p_hat - simple_apply_kernel(A_entry, gko::batch::to_const(p_hat_entry), - v_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + A_entry, gko::batch::to_const(p_hat_entry), v_entry); // alpha = rho_new / < r_hat , v> compute_alpha(gko::batch::to_const(rho_new_entry), @@ -303,8 +303,8 @@ inline void batch_entry_bicgstab_impl( prec.apply(gko::batch::to_const(s_entry), s_hat_entry); // t = A * s_hat - simple_apply_kernel(A_entry, gko::batch::to_const(s_hat_entry), - t_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + A_entry, gko::batch::to_const(s_hat_entry), t_entry); // omega = / compute_omega(gko::batch::to_const(t_entry), gko::batch::to_const(s_entry), temp_entry, omega_entry); diff 
--git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp index 785a7a868a2..7c69157d4a7 100644 --- a/reference/solver/batch_cg_kernels.cpp +++ b/reference/solver/batch_cg_kernels.cpp @@ -6,30 +6,21 @@ #include "core/solver/batch_dispatch.hpp" #include "reference/base/batch_multi_vector_kernels.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" namespace gko { namespace kernels { namespace reference { - - -/** - * @brief The batch Cg solver namespace. - * - * @ingroup batch_cg - */ namespace batch_cg { - - namespace { constexpr int max_num_rhs = 1; -#include "reference/matrix/batch_csr_kernels.hpp.inc" -#include "reference/matrix/batch_dense_kernels.hpp.inc" -#include "reference/matrix/batch_ell_kernels.hpp.inc" #include "reference/solver/batch_cg_kernels.hpp.inc" diff --git a/reference/solver/batch_cg_kernels.hpp.inc b/reference/solver/batch_cg_kernels.hpp.inc index ca88940cd69..991db5c061c 100644 --- a/reference/solver/batch_cg_kernels.hpp.inc +++ b/reference/solver/batch_cg_kernels.hpp.inc @@ -34,9 +34,9 @@ inline void initialize( b_entry, r_entry); // r = b - A*x - advanced_apply_kernel(static_cast(-1.0), A_entry, - gko::batch::to_const(x_entry), - static_cast(1.0), r_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + static_cast(-1.0), A_entry, gko::batch::to_const(x_entry), + static_cast(1.0), r_entry); } @@ -181,7 +181,8 @@ inline void batch_entry_cg_impl( gko::batch::to_const(z_entry), p_entry); // Ap = A * p - simple_apply_kernel(A_entry, gko::batch::to_const(p_entry), Ap_entry); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + A_entry, gko::batch::to_const(p_entry), Ap_entry); // temp= rho_old / (p' * Ap) // x = x + temp * p From 4d6756fd7690ec508c3ff6d693af4ee5f377ad13 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 22 Aug 2024 17:57:03 +0200 Subject: 
[PATCH 146/448] [dpcpp] unify dpcpp kernels --- dpcpp/matrix/batch_csr_kernels.dp.cpp | 28 ++++------- ..._kernels.hpp.inc => batch_csr_kernels.hpp} | 48 ++++++++++++++---- dpcpp/matrix/batch_dense_kernels.dp.cpp | 50 +++++++++---------- ...ernels.hpp.inc => batch_dense_kernels.hpp} | 40 +++++++++++++-- dpcpp/matrix/batch_ell_kernels.dp.cpp | 24 ++++----- ..._kernels.hpp.inc => batch_ell_kernels.hpp} | 38 ++++++++++++-- dpcpp/solver/batch_bicgstab_kernels.dp.cpp | 11 ++-- dpcpp/solver/batch_bicgstab_kernels.hpp.inc | 12 +++-- dpcpp/solver/batch_cg_kernels.dp.cpp | 11 ++-- dpcpp/solver/batch_cg_kernels.hpp.inc | 9 ++-- 10 files changed, 168 insertions(+), 103 deletions(-) rename dpcpp/matrix/{batch_csr_kernels.hpp.inc => batch_csr_kernels.hpp} (67%) rename dpcpp/matrix/{batch_dense_kernels.hpp.inc => batch_dense_kernels.hpp} (84%) rename dpcpp/matrix/{batch_ell_kernels.hpp.inc => batch_ell_kernels.hpp} (78%) diff --git a/dpcpp/matrix/batch_csr_kernels.dp.cpp b/dpcpp/matrix/batch_csr_kernels.dp.cpp index 9feb824a3aa..1759a959299 100644 --- a/dpcpp/matrix/batch_csr_kernels.dp.cpp +++ b/dpcpp/matrix/batch_csr_kernels.dp.cpp @@ -21,23 +21,16 @@ #include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_csr_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace dpcpp { -/** - * @brief The Csr matrix format namespace. 
- * @ref Csr - * @ingroup batch_csr - */ namespace batch_csr { -#include "dpcpp/matrix/batch_csr_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Csr* mat, @@ -74,8 +67,8 @@ void simple_apply(std::shared_ptr exec, batch::matrix::extract_batch_item(mat_ub, group_id); const auto b_b = batch::extract_batch_item(b_ub, group_id); const auto x_b = batch::extract_batch_item(x_ub, group_id); - simple_apply_kernel(mat_b, b_b.values, x_b.values, - item_ct1); + batch_single_kernels::simple_apply(mat_b, b_b.values, + x_b.values, item_ct1); }); }); } @@ -127,9 +120,9 @@ void advanced_apply(std::shared_ptr exec, batch::extract_batch_item(alpha_ub, group_id); const auto beta_b = batch::extract_batch_item(beta_ub, group_id); - advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values, - beta_b.values[0], x_b.values, - item_ct1); + batch_single_kernels::advanced_apply( + alpha_b.values[0], mat_b, b_b.values, beta_b.values[0], + x_b.values, item_ct1); }); }); } @@ -172,9 +165,10 @@ void scale(std::shared_ptr exec, row_scale_vals + num_rows * group_id; const auto mat_item = batch::matrix::extract_batch_item(mat_ub, group_id); - scale_kernel(mat_item.num_rows, col_scale_b, row_scale_b, - mat_item.col_idxs, mat_item.row_ptrs, - mat_item.values, item_ct1); + batch_single_kernels::scale(mat_item.num_rows, col_scale_b, + row_scale_b, mat_item.col_idxs, + mat_item.row_ptrs, + mat_item.values, item_ct1); }); }); } @@ -215,7 +209,7 @@ void add_scaled_identity(std::shared_ptr exec, gko::batch::extract_batch_item(beta_ub, group_id); const auto mat_b = gko::batch::matrix::extract_batch_item( mat_ub, group_id); - add_scaled_identity_kernel( + batch_single_kernels::add_scaled_identity( alpha_b.values[0], beta_b.values[0], mat_b, item_ct1); }); }); diff --git a/dpcpp/matrix/batch_csr_kernels.hpp.inc b/dpcpp/matrix/batch_csr_kernels.hpp similarity index 67% rename from dpcpp/matrix/batch_csr_kernels.hpp.inc rename to dpcpp/matrix/batch_csr_kernels.hpp 
index 4379e02d0b7..f51124f81a4 100644 --- a/dpcpp/matrix/batch_csr_kernels.hpp.inc +++ b/dpcpp/matrix/batch_csr_kernels.hpp @@ -2,8 +2,32 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template -__dpct_inline__ void simple_apply_kernel( +__dpct_inline__ void simple_apply( const gko::batch::matrix::csr::batch_item& mat, const ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1) { @@ -23,7 +47,7 @@ __dpct_inline__ void simple_apply_kernel( template -__dpct_inline__ void advanced_apply_kernel( +__dpct_inline__ void advanced_apply( const ValueType alpha, const gko::batch::matrix::csr::batch_item& mat, const ValueType* b, const ValueType beta, ValueType* x, @@ -45,13 +69,11 @@ __dpct_inline__ void advanced_apply_kernel( template -__dpct_inline__ void scale_kernel(const int num_rows, - const ValueType* const col_scale, - const ValueType* const row_scale, - const IndexType* const col_idxs, - const IndexType* const row_ptrs, - ValueType* const values, - sycl::nd_item<3>& item_ct1) +__dpct_inline__ void scale(const int num_rows, const ValueType* const col_scale, + const ValueType* const row_scale, + const IndexType* const col_idxs, + const IndexType* const row_ptrs, + ValueType* const values, sycl::nd_item<3>& item_ct1) { for (int row = item_ct1.get_local_linear_id(); row < num_rows; row += item_ct1.get_local_range().size()) { @@ -64,7 +86,7 @@ 
__dpct_inline__ void scale_kernel(const int num_rows, template -__dpct_inline__ void add_scaled_identity_kernel( +__dpct_inline__ void add_scaled_identity( const ValueType alpha, const ValueType beta, const gko::batch::matrix::csr::batch_item& mat, sycl::nd_item<3>& item_ct1) @@ -80,3 +102,9 @@ __dpct_inline__ void add_scaled_identity_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp index a9f6afce0f5..2cebbe326e8 100644 --- a/dpcpp/matrix/batch_dense_kernels.dp.cpp +++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp @@ -25,23 +25,16 @@ #include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_dense_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace dpcpp { -/** - * @brief The Dense matrix format namespace. 
- * - * @ingroup batch_dense - */ namespace batch_dense { -#include "dpcpp/matrix/batch_dense_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Dense* mat, @@ -77,8 +70,8 @@ void simple_apply(std::shared_ptr exec, batch::matrix::extract_batch_item(mat_ub, group_id); const auto b_b = batch::extract_batch_item(b_ub, group_id); const auto x_b = batch::extract_batch_item(x_ub, group_id); - simple_apply_kernel(mat_b, b_b.values, x_b.values, - item_ct1); + batch_single_kernels::simple_apply_kernel( + mat_b, b_b.values, x_b.values, item_ct1); }); }); } @@ -129,9 +122,9 @@ void advanced_apply(std::shared_ptr exec, batch::extract_batch_item(alpha_ub, group_id); const auto beta_b = batch::extract_batch_item(beta_ub, group_id); - advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values, - beta_b.values[0], x_b.values, - item_ct1); + batch_single_kernels::advanced_apply( + alpha_b.values[0], mat_b, b_b.values, beta_b.values[0], + x_b.values, item_ct1); }); }); } @@ -174,7 +167,8 @@ void scale(std::shared_ptr exec, row_scale_vals + num_rows * group_id; auto input_mat = batch::matrix::extract_batch_item(mat_ub, group_id); - scale_kernel(col_scale_b, row_scale_b, input_mat, item_ct1); + batch_single_kernels::scale(col_scale_b, row_scale_b, + input_mat, item_ct1); }); }); } @@ -204,18 +198,20 @@ void scale_add(std::shared_ptr exec, exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - auto group = item_ct1.get_group(); - auto group_id = group.get_group_linear_id(); - const auto alpha_b = - gko::batch::extract_batch_item(alpha_ub, group_id); - const auto mat_b = - gko::batch::matrix::extract_batch_item(mat_ub, group_id); - const auto in_out_b = - gko::batch::matrix::extract_batch_item(in_out_ub, group_id); - scale_add_kernel(alpha_b.values[0], mat_b, in_out_b, item_ct1); - }); + [=](sycl::nd_item<3> item_ct1) 
+ [[sycl::reqd_sub_group_size(config::warp_size)]] { + auto group = item_ct1.get_group(); + auto group_id = group.get_group_linear_id(); + const auto alpha_b = + gko::batch::extract_batch_item(alpha_ub, group_id); + const auto mat_b = gko::batch::matrix::extract_batch_item( + mat_ub, group_id); + const auto in_out_b = + gko::batch::matrix::extract_batch_item(in_out_ub, + group_id); + batch_single_kernels::scale_add(alpha_b.values[0], mat_b, + in_out_b, item_ct1); + }); }); } @@ -254,7 +250,7 @@ void add_scaled_identity(std::shared_ptr exec, gko::batch::extract_batch_item(beta_ub, group_id); const auto mat_b = gko::batch::matrix::extract_batch_item( mat_ub, group_id); - add_scaled_identity_kernel( + batch_single_kernels::add_scaled_identity( alpha_b.values[0], beta_b.values[0], mat_b, item_ct1); }); }); diff --git a/dpcpp/matrix/batch_dense_kernels.hpp.inc b/dpcpp/matrix/batch_dense_kernels.hpp similarity index 84% rename from dpcpp/matrix/batch_dense_kernels.hpp.inc rename to dpcpp/matrix/batch_dense_kernels.hpp index 98282fe253d..acf1e65939d 100644 --- a/dpcpp/matrix/batch_dense_kernels.hpp.inc +++ b/dpcpp/matrix/batch_dense_kernels.hpp @@ -2,8 +2,32 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template -__dpct_inline__ void simple_apply_kernel( +__dpct_inline__ void simple_apply( const gko::batch::matrix::dense::batch_item& mat, const 
ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1) { @@ -34,7 +58,7 @@ __dpct_inline__ void simple_apply_kernel( template -__dpct_inline__ void advanced_apply_kernel( +__dpct_inline__ void advanced_apply( const ValueType alpha, const gko::batch::matrix::dense::batch_item& mat, const ValueType* b, const ValueType beta, ValueType* x, @@ -67,7 +91,7 @@ __dpct_inline__ void advanced_apply_kernel( template -__dpct_inline__ void scale_kernel( +__dpct_inline__ void scale( const ValueType* const col_scale, const ValueType* const row_scale, gko::batch::matrix::dense::batch_item& mat, sycl::nd_item<3>& item_ct1) @@ -91,7 +115,7 @@ __dpct_inline__ void scale_kernel( template -__dpct_inline__ void scale_add_kernel( +__dpct_inline__ void scale_add( const ValueType alpha, const gko::batch::matrix::dense::batch_item& mat, const gko::batch::matrix::dense::batch_item& in_out, @@ -117,7 +141,7 @@ __dpct_inline__ void scale_add_kernel( template -__dpct_inline__ void add_scaled_identity_kernel( +__dpct_inline__ void add_scaled_identity( const ValueType alpha, const ValueType beta, const gko::batch::matrix::dense::batch_item& mat, sycl::nd_item<3>& item_ct1) @@ -140,3 +164,9 @@ __dpct_inline__ void add_scaled_identity_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp index 2cb40dc35eb..d9b819b101e 100644 --- a/dpcpp/matrix/batch_ell_kernels.dp.cpp +++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp @@ -21,23 +21,16 @@ #include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_ell_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace dpcpp { -/** - * @brief The Ell matrix format namespace. 
- * @ref Ell - * @ingroup batch_ell - */ namespace batch_ell { -#include "dpcpp/matrix/batch_ell_kernels.hpp.inc" - - template void simple_apply(std::shared_ptr exec, const batch::matrix::Ell* mat, @@ -74,8 +67,8 @@ void simple_apply(std::shared_ptr exec, batch::matrix::extract_batch_item(mat_ub, group_id); const auto b_b = batch::extract_batch_item(b_ub, group_id); const auto x_b = batch::extract_batch_item(x_ub, group_id); - simple_apply_kernel(mat_b, b_b.values, x_b.values, - item_ct1); + batch_single_kernels::simple_apply(mat_b, b_b.values, + x_b.values, item_ct1); }); }); } @@ -127,9 +120,9 @@ void advanced_apply(std::shared_ptr exec, batch::extract_batch_item(alpha_ub, group_id); const auto beta_b = batch::extract_batch_item(beta_ub, group_id); - advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values, - beta_b.values[0], x_b.values, - item_ct1); + batch_single_kernels::advanced_apply( + alpha_b.values[0], mat_b, b_b.values, beta_b.values[0], + x_b.values, item_ct1); }); }); } @@ -171,7 +164,8 @@ void scale(std::shared_ptr exec, row_scale_vals + num_rows * group_id; auto mat_item = batch::matrix::extract_batch_item(mat_ub, group_id); - scale_kernel(col_scale_b, row_scale_b, mat_item, item_ct1); + batch_single_kernels::scale(col_scale_b, row_scale_b, + mat_item, item_ct1); }); }); } @@ -212,7 +206,7 @@ void add_scaled_identity(std::shared_ptr exec, gko::batch::extract_batch_item(beta_ub, group_id); const auto mat_b = gko::batch::matrix::extract_batch_item( mat_ub, group_id); - add_scaled_identity_kernel( + batch_single_kernels::add_scaled_identity( alpha_b.values[0], beta_b.values[0], mat_b, item_ct1); }); }); diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp similarity index 78% rename from dpcpp/matrix/batch_ell_kernels.hpp.inc rename to dpcpp/matrix/batch_ell_kernels.hpp index 1a809664dca..48ab9318bdf 100644 --- a/dpcpp/matrix/batch_ell_kernels.hpp.inc +++ b/dpcpp/matrix/batch_ell_kernels.hpp @@ -2,8 +2,32 @@ // // 
SPDX-License-Identifier: BSD-3-Clause +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template -__dpct_inline__ void simple_apply_kernel( +__dpct_inline__ void simple_apply( const gko::batch::matrix::ell::batch_item& mat, const ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1) { @@ -24,7 +48,7 @@ __dpct_inline__ void simple_apply_kernel( template -__dpct_inline__ void advanced_apply_kernel( +__dpct_inline__ void advanced_apply( const ValueType alpha, const gko::batch::matrix::ell::batch_item& mat, const ValueType* b, const ValueType beta, ValueType* x, @@ -47,7 +71,7 @@ __dpct_inline__ void advanced_apply_kernel( template -__dpct_inline__ void scale_kernel( +__dpct_inline__ void scale( const ValueType* const col_scale, const ValueType* const row_scale, gko::batch::matrix::ell::batch_item& mat, sycl::nd_item<3>& item_ct1) @@ -69,7 +93,7 @@ __dpct_inline__ void scale_kernel( template -__dpct_inline__ void add_scaled_identity_kernel( +__dpct_inline__ void add_scaled_identity( const ValueType alpha, const ValueType beta, const gko::batch::matrix::ell::batch_item& mat, sycl::nd_item<3>& item_ct1) @@ -89,3 +113,9 @@ __dpct_inline__ void add_scaled_identity_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp index 
7dc8f3ec23b..291ee1d8a8b 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp +++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp @@ -23,23 +23,18 @@ #include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_csr_kernels.hpp" +#include "dpcpp/matrix/batch_dense_kernels.hpp" +#include "dpcpp/matrix/batch_ell_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace dpcpp { -/** - * @brief The batch Bicgstab solver namespace. - * - * @ingroup batch_bicgstab - */ namespace batch_bicgstab { -#include "dpcpp/matrix/batch_csr_kernels.hpp.inc" -#include "dpcpp/matrix/batch_dense_kernels.hpp.inc" -#include "dpcpp/matrix/batch_ell_kernels.hpp.inc" #include "dpcpp/solver/batch_bicgstab_kernels.hpp.inc" diff --git a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc index f5a88e9d59d..de1956c8c6c 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc +++ b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc @@ -33,9 +33,9 @@ __dpct_inline__ void initialize( item_ct1.barrier(sycl::access::fence_space::global_and_local); // r = b - A*x - advanced_apply_kernel(static_cast(-1.0), mat_global_entry, - x_shared_entry, static_cast(1.0), - r_shared_entry, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + static_cast(-1.0), mat_global_entry, x_shared_entry, + static_cast(1.0), r_shared_entry, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); if (sg_id == 0) { @@ -330,7 +330,8 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, item_ct1.barrier(sycl::access::fence_space::global_and_local); // v = A * p_hat - simple_apply_kernel(mat_global_entry, p_hat_sh, v_sh, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + mat_global_entry, p_hat_sh, v_sh, item_ct1); 
item_ct1.barrier(sycl::access::fence_space::global_and_local); // alpha = rho_new / < r_hat , v> @@ -361,7 +362,8 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, item_ct1.barrier(sycl::access::fence_space::global_and_local); // t = A * s_hat - simple_apply_kernel(mat_global_entry, s_hat_sh, t_sh, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + mat_global_entry, s_hat_sh, t_sh, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); // omega = / diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp index f25d8266803..05b3f7b803c 100644 --- a/dpcpp/solver/batch_cg_kernels.dp.cpp +++ b/dpcpp/solver/batch_cg_kernels.dp.cpp @@ -23,23 +23,18 @@ #include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_csr_kernels.hpp" +#include "dpcpp/matrix/batch_dense_kernels.hpp" +#include "dpcpp/matrix/batch_ell_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" namespace gko { namespace kernels { namespace dpcpp { -/** - * @brief The batch Cg solver namespace. 
- * - * @ingroup batch_cg - */ namespace batch_cg { -#include "dpcpp/matrix/batch_csr_kernels.hpp.inc" -#include "dpcpp/matrix/batch_dense_kernels.hpp.inc" -#include "dpcpp/matrix/batch_ell_kernels.hpp.inc" #include "dpcpp/solver/batch_cg_kernels.hpp.inc" diff --git a/dpcpp/solver/batch_cg_kernels.hpp.inc b/dpcpp/solver/batch_cg_kernels.hpp.inc index 7a91bcb2bbf..b233b7df680 100644 --- a/dpcpp/solver/batch_cg_kernels.hpp.inc +++ b/dpcpp/solver/batch_cg_kernels.hpp.inc @@ -27,9 +27,9 @@ __dpct_inline__ void initialize( item_ct1.barrier(sycl::access::fence_space::global_and_local); // r = b - A*x - advanced_apply_kernel(static_cast(-1.0), mat_global_entry, - x_shared_entry, static_cast(1.0), - r_shared_entry, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + static_cast(-1.0), mat_global_entry, x_shared_entry, + static_cast(1.0), r_shared_entry, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -207,7 +207,8 @@ __dpct_inline__ void apply_kernel( break; } // Ap = A * p - simple_apply_kernel(mat_global_entry, p_sh, Ap_sh, item_ct1); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + mat_global_entry, p_sh, Ap_sh, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); // alpha = rho_old / (p' * Ap) From 927a35f1be3c93aca03f526ef7b60d3939d74999 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 22 Aug 2024 18:03:28 +0200 Subject: [PATCH 147/448] [hip, cuda] remove unnecessary .hip.cpp/.cu files --- cuda/matrix/batch_dense_kernels.cu | 56 -------------------------- cuda/matrix/batch_ell_kernels.cu | 55 ------------------------- hip/matrix/batch_csr_kernels.hip.cpp | 55 ------------------------- hip/matrix/batch_dense_kernels.hip.cpp | 56 -------------------------- hip/matrix/batch_ell_kernels.hip.cpp | 55 ------------------------- 5 files changed, 277 deletions(-) delete mode 100644 cuda/matrix/batch_dense_kernels.cu delete mode 100644 
cuda/matrix/batch_ell_kernels.cu delete mode 100644 hip/matrix/batch_csr_kernels.hip.cpp delete mode 100644 hip/matrix/batch_dense_kernels.hip.cpp delete mode 100644 hip/matrix/batch_ell_kernels.hip.cpp diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu deleted file mode 100644 index 10148ee242b..00000000000 --- a/cuda/matrix/batch_dense_kernels.cu +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_dense_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Dense matrix format namespace. 
- * - * @ingroup batch_dense - */ -namespace batch_dense { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" - - -// clang-format on - - -} // namespace batch_dense -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu deleted file mode 100644 index 25281cf6f81..00000000000 --- a/cuda/matrix/batch_ell_kernels.cu +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_ell_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Ell matrix format namespace. 
- * @ref Ell - * @ingroup batch_ell - */ -namespace batch_ell { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_ell -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp deleted file mode 100644 index b77b9416505..00000000000 --- a/hip/matrix/batch_csr_kernels.hip.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_csr_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Csr matrix format namespace. 
- * @ref Csr - * @ingroup batch_csr - */ -namespace batch_csr { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_csr -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp deleted file mode 100644 index 67dfd78e264..00000000000 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_dense_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Dense matrix format namespace. 
- * - * @ingroup batch_dense - */ -namespace batch_dense { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" - - -// clang-format on - - -} // namespace batch_dense -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp deleted file mode 100644 index 68b59c042f1..00000000000 --- a/hip/matrix/batch_ell_kernels.hip.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_ell_kernels.hpp" - -#include - -#include -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Ell matrix format namespace. 
- * @ref Ell - * @ingroup batch_ell - */ -namespace batch_ell { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_ell -} // namespace hip -} // namespace kernels -} // namespace gko From 2283e78adef133cf7230c9fee3e18b246c0d8929 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 22 Aug 2024 18:20:36 +0200 Subject: [PATCH 148/448] fixup! [dpcpp] unify dpcpp kernels --- dpcpp/matrix/batch_dense_kernels.dp.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp index 2cebbe326e8..43974589abb 100644 --- a/dpcpp/matrix/batch_dense_kernels.dp.cpp +++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp @@ -70,8 +70,8 @@ void simple_apply(std::shared_ptr exec, batch::matrix::extract_batch_item(mat_ub, group_id); const auto b_b = batch::extract_batch_item(b_ub, group_id); const auto x_b = batch::extract_batch_item(x_ub, group_id); - batch_single_kernels::simple_apply_kernel( - mat_b, b_b.values, x_b.values, item_ct1); + batch_single_kernels::simple_apply(mat_b, b_b.values, + x_b.values, item_ct1); }); }); } From bd73597b5af439ed4d2544b4b9a9c14fa4787081 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 23 Aug 2024 13:14:36 +0200 Subject: [PATCH 149/448] [cuda, hip] unify batch_struct headers --- .../base/batch_multi_vector_kernels.hpp | 9 +- .../cuda_hip}/base/batch_struct.hpp | 19 +-- common/cuda_hip/base/types.hpp | 4 + common/cuda_hip/matrix/batch_csr_kernels.hpp | 12 +- .../cuda_hip/matrix/batch_dense_kernels.hpp | 12 +- common/cuda_hip/matrix/batch_ell_kernels.hpp | 12 +- .../cuda_hip}/matrix/batch_struct.hpp | 46 +++--- core/solver/batch_dispatch.hpp | 8 +- 
cuda/preconditioner/batch_jacobi_kernels.cu | 4 +- cuda/solver/batch_bicgstab_kernels.cu | 4 +- cuda/solver/batch_cg_kernels.cu | 4 +- hip/base/batch_struct.hip.hpp | 64 -------- hip/matrix/batch_struct.hip.hpp | 142 ------------------ .../batch_jacobi_kernels.hip.cpp | 4 +- hip/solver/batch_bicgstab_kernels.hip.cpp | 4 +- hip/solver/batch_cg_kernels.hip.cpp | 4 +- 16 files changed, 58 insertions(+), 294 deletions(-) rename {cuda => common/cuda_hip}/base/batch_struct.hpp (71%) rename {cuda => common/cuda_hip}/matrix/batch_struct.hpp (75%) delete mode 100644 hip/base/batch_struct.hip.hpp delete mode 100644 hip/matrix/batch_struct.hip.hpp diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp index 0cbbdf9f5ee..1cd9d6c752b 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp @@ -10,6 +10,7 @@ #include #include +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" @@ -22,14 +23,6 @@ #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/warp_blas.hpp" -#if defined(GKO_COMPILING_CUDA) -#include "cuda/base/batch_struct.hpp" -#elif defined(GKO_COMPILING_HIP) -#include "hip/base/batch_struct.hip.hpp" -#else -#error "batch struct def missing" -#endif - namespace gko { namespace kernels { diff --git a/cuda/base/batch_struct.hpp b/common/cuda_hip/base/batch_struct.hpp similarity index 71% rename from cuda/base/batch_struct.hpp rename to common/cuda_hip/base/batch_struct.hpp index 9f07b6b4532..bc10752975f 100644 --- a/cuda/base/batch_struct.hpp +++ b/common/cuda_hip/base/batch_struct.hpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_CUDA_BASE_BATCH_STRUCT_HPP_ -#define GKO_CUDA_BASE_BATCH_STRUCT_HPP_ +#ifndef GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_ +#define 
GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_ #include @@ -11,12 +11,13 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" +#include "common/unified/base/kernel_launch.hpp" #include "core/base/batch_struct.hpp" namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { /** @file batch_struct.hpp @@ -33,10 +34,10 @@ namespace cuda { * Generates an immutable uniform batch struct from a batch of multi-vectors. */ template -inline batch::multi_vector::uniform_batch> +inline batch::multi_vector::uniform_batch> get_batch_struct(const batch::MultiVector* const op) { - return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(), + return {as_device_type(op->get_const_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; @@ -46,19 +47,19 @@ get_batch_struct(const batch::MultiVector* const op) * Generates a uniform batch struct from a batch of multi-vectors. 
*/ template -inline batch::multi_vector::uniform_batch> +inline batch::multi_vector::uniform_batch> get_batch_struct(batch::MultiVector* const op) { - return {as_cuda_type(op->get_values()), op->get_num_batch_items(), + return {as_device_type(op->get_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; } -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko -#endif // GKO_CUDA_BASE_BATCH_STRUCT_HPP_ +#endif // GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_ diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp index 08f0516d691..ee1c76a0585 100644 --- a/common/cuda_hip/base/types.hpp +++ b/common/cuda_hip/base/types.hpp @@ -8,8 +8,12 @@ #if defined(GKO_COMPILING_CUDA) #include "cuda/base/types.hpp" +#define device_type cuda_type +#define as_device_type as_cuda_type #elif defined(GKO_COMPILING_HIP) #include "hip/base/types.hip.hpp" +#define device_type hip_type +#define as_device_type as_hip_type #else #error "Executor definition missing" #endif diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp b/common/cuda_hip/matrix/batch_csr_kernels.hpp index 32d22e435eb..64611559715 100644 --- a/common/cuda_hip/matrix/batch_csr_kernels.hpp +++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp @@ -11,6 +11,7 @@ #include #include +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" @@ -22,16 +23,7 @@ #include "common/cuda_hip/components/segment_scan.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/warp_blas.hpp" - -#if defined(GKO_COMPILING_CUDA) -#include "cuda/base/batch_struct.hpp" -#include "cuda/matrix/batch_struct.hpp" -#elif defined(GKO_COMPILING_HIP) -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" 
-#else -#error "batch struct def missing" -#endif +#include "common/cuda_hip/matrix/batch_struct.hpp" namespace gko { diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp b/common/cuda_hip/matrix/batch_dense_kernels.hpp index 74b81008b38..e4cd24bbd78 100644 --- a/common/cuda_hip/matrix/batch_dense_kernels.hpp +++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp @@ -11,6 +11,7 @@ #include #include +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" @@ -22,16 +23,7 @@ #include "common/cuda_hip/components/segment_scan.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/warp_blas.hpp" - -#if defined(GKO_COMPILING_CUDA) -#include "cuda/base/batch_struct.hpp" -#include "cuda/matrix/batch_struct.hpp" -#elif defined(GKO_COMPILING_HIP) -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" -#else -#error "batch struct def missing" -#endif +#include "common/cuda_hip/matrix/batch_struct.hpp" namespace gko { diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp b/common/cuda_hip/matrix/batch_ell_kernels.hpp index e8cadc29cd3..52826957ddb 100644 --- a/common/cuda_hip/matrix/batch_ell_kernels.hpp +++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp @@ -11,6 +11,7 @@ #include #include +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" @@ -22,16 +23,7 @@ #include "common/cuda_hip/components/segment_scan.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/warp_blas.hpp" - -#if defined(GKO_COMPILING_CUDA) -#include "cuda/base/batch_struct.hpp" -#include "cuda/matrix/batch_struct.hpp" -#elif defined(GKO_COMPILING_HIP) -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" -#else 
-#error "batch struct def missing" -#endif +#include "common/cuda_hip/matrix/batch_struct.hpp" namespace gko { diff --git a/cuda/matrix/batch_struct.hpp b/common/cuda_hip/matrix/batch_struct.hpp similarity index 75% rename from cuda/matrix/batch_struct.hpp rename to common/cuda_hip/matrix/batch_struct.hpp index 8a1b8fee00a..e88eca245bb 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/common/cuda_hip/matrix/batch_struct.hpp @@ -2,35 +2,31 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ -#define GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ +#ifndef GKO_COMMON_CUDA_HIP_MATRIX_BATCH_STRUCT_HPP_ +#define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_STRUCT_HPP_ +#include #include #include #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" +#include "common/unified/base/kernel_launch.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" namespace gko { namespace kernels { -namespace cuda { -namespace { - - -constexpr auto default_block_size = 256; - - -} +namespace GKO_DEVICE_NAMESPACE { /** @file batch_struct.hpp * * Helper functions to generate a batch struct from a batch LinOp, - * while also shallow-casting to the required CUDA scalar type. + * while also shallow-casting to the required GKO_DEVICE_NAMESPACE scalar + * type. * * A specialization is needed for every format of every kind of linear algebra * object. These are intended to be called on the host. @@ -41,11 +37,11 @@ constexpr auto default_block_size = 256; * Generates an immutable uniform batch struct from a batch of csr matrices. 
*/ template -inline batch::matrix::csr::uniform_batch, +inline batch::matrix::csr::uniform_batch, const IndexType> get_batch_struct(const batch::matrix::Csr* const op) { - return {as_cuda_type(op->get_const_values()), + return {as_device_type(op->get_const_values()), op->get_const_col_idxs(), op->get_const_row_ptrs(), op->get_num_batch_items(), @@ -59,10 +55,10 @@ get_batch_struct(const batch::matrix::Csr* const op) * Generates a uniform batch struct from a batch of csr matrices. */ template -inline batch::matrix::csr::uniform_batch, IndexType> +inline batch::matrix::csr::uniform_batch, IndexType> get_batch_struct(batch::matrix::Csr* const op) { - return {as_cuda_type(op->get_values()), + return {as_device_type(op->get_values()), op->get_col_idxs(), op->get_row_ptrs(), op->get_num_batch_items(), @@ -76,10 +72,10 @@ get_batch_struct(batch::matrix::Csr* const op) * Generates an immutable uniform batch struct from a batch of dense matrices. */ template -inline batch::matrix::dense::uniform_batch> +inline batch::matrix::dense::uniform_batch> get_batch_struct(const batch::matrix::Dense* const op) { - return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(), + return {as_device_type(op->get_const_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; @@ -90,10 +86,10 @@ get_batch_struct(const batch::matrix::Dense* const op) * Generates a uniform batch struct from a batch of dense matrices. 
*/ template -inline batch::matrix::dense::uniform_batch> +inline batch::matrix::dense::uniform_batch> get_batch_struct(batch::matrix::Dense* const op) { - return {as_cuda_type(op->get_values()), op->get_num_batch_items(), + return {as_device_type(op->get_values()), op->get_num_batch_items(), static_cast(op->get_common_size()[1]), static_cast(op->get_common_size()[0]), static_cast(op->get_common_size()[1])}; @@ -104,11 +100,11 @@ get_batch_struct(batch::matrix::Dense* const op) * Generates an immutable uniform batch struct from a batch of ell matrices. */ template -inline batch::matrix::ell::uniform_batch, +inline batch::matrix::ell::uniform_batch, const IndexType> get_batch_struct(const batch::matrix::Ell* const op) { - return {as_cuda_type(op->get_const_values()), + return {as_device_type(op->get_const_values()), op->get_const_col_idxs(), op->get_num_batch_items(), static_cast(op->get_common_size()[0]), @@ -122,10 +118,10 @@ get_batch_struct(const batch::matrix::Ell* const op) * Generates a uniform batch struct from a batch of ell matrices. 
*/ template -inline batch::matrix::ell::uniform_batch, IndexType> +inline batch::matrix::ell::uniform_batch, IndexType> get_batch_struct(batch::matrix::Ell* const op) { - return {as_cuda_type(op->get_values()), + return {as_device_type(op->get_values()), op->get_col_idxs(), op->get_num_batch_items(), static_cast(op->get_common_size()[0]), @@ -135,9 +131,9 @@ get_batch_struct(batch::matrix::Ell* const op) } -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko -#endif // GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_ +#endif // GKO_COMMON_CUDA_HIP_MATRIX_BATCH_STRUCT_HPP_ diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index 8a142a5224a..599c708b334 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -24,10 +24,10 @@ #if defined GKO_COMPILING_CUDA -#include "cuda/base/batch_struct.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "cuda/components/cooperative_groups.cuh" #include "cuda/log/batch_logger.cuh" -#include "cuda/matrix/batch_struct.hpp" #include "cuda/preconditioner/batch_preconditioners.cuh" #include "cuda/stop/batch_criteria.cuh" @@ -52,10 +52,10 @@ using DeviceValueType = typename gko::kernels::cuda::cuda_type; #elif defined GKO_COMPILING_HIP -#include "hip/base/batch_struct.hip.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/log/batch_logger.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" #include "hip/preconditioner/batch_preconditioners.hip.hpp" #include "hip/stop/batch_criteria.hip.hpp" diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu index 716c158ffff..edf052cb649 100644 --- a/cuda/preconditioner/batch_jacobi_kernels.cu +++ b/cuda/preconditioner/batch_jacobi_kernels.cu @@ -8,19 +8,19 @@ #include #include 
+#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/components/intrinsics.hpp" #include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/batch_struct.hpp" #include "cuda/base/config.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/matrix/batch_struct.hpp" // generated header #include "common/cuda_hip/preconditioner/jacobi_common.hpp" diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 09e737c8793..35d567fd911 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -11,6 +11,7 @@ #include #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/thrust.hpp" @@ -22,11 +23,10 @@ #include "common/cuda_hip/matrix/batch_csr_kernels.hpp" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/matrix/batch_struct.hpp" namespace gko { diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 7ac876de3a2..f26f2d37313 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -11,6 +11,7 @@ #include #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include 
"common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" @@ -21,11 +22,10 @@ #include "common/cuda_hip/matrix/batch_csr_kernels.hpp" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/matrix/batch_struct.hpp" namespace gko { diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp deleted file mode 100644 index 3e4cba6a747..00000000000 --- a/hip/base/batch_struct.hip.hpp +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_ -#define GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_ - - -#include -#include - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "core/base/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -/** @file batch_struct.hpp - * - * Helper functions to generate a batch struct from a batch LinOp, - * while also shallow-casting to the required Hip scalar type. - * - * A specialization is needed for every format of every kind of linear algebra - * object. These are intended to be called on the host. - */ - - -/** - * Generates an immutable uniform batch struct from a batch of multi-vectors. 
- */ -template -inline batch::multi_vector::uniform_batch> -get_batch_struct(const batch::MultiVector* const op) -{ - return {as_hip_type(op->get_const_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; -} - -/** - * Generates a uniform batch struct from a batch of multi-vectors. - */ -template -inline batch::multi_vector::uniform_batch> get_batch_struct( - batch::MultiVector* const op) -{ - return {as_hip_type(op->get_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; -} - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_ diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp deleted file mode 100644 index a8d14b84bb7..00000000000 --- a/hip/matrix/batch_struct.hip.hpp +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ -#define GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ - - -#include -#include - -#include "common/cuda_hip/base/types.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace { - - -constexpr auto default_block_size = 256; - - -} - - -/** @file batch_struct.hpp - * - * Helper functions to generate a batch struct from a batch LinOp, - * while also shallow-casting to the required HIP scalar type. - * - * A specialization is needed for every format of every kind of linear algebra - * object. These are intended to be called on the host. - */ - - -/** - * Generates an immutable uniform batch struct from a batch of csr matrices. 
- */ -template -inline batch::matrix::csr::uniform_batch, - const IndexType> -get_batch_struct(const batch::matrix::Csr* const op) -{ - return {as_hip_type(op->get_const_values()), - op->get_const_col_idxs(), - op->get_const_row_ptrs(), - op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_elements_per_item())}; -} - - -/** - * Generates a uniform batch struct from a batch of csr matrices. - */ -template -inline batch::matrix::csr::uniform_batch, IndexType> -get_batch_struct(batch::matrix::Csr* const op) -{ - return {as_hip_type(op->get_values()), - op->get_col_idxs(), - op->get_row_ptrs(), - op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_elements_per_item())}; -} - - -/** - * Generates an immutable uniform batch struct from a batch of dense matrices. - */ -template -inline batch::matrix::dense::uniform_batch> -get_batch_struct(const batch::matrix::Dense* const op) -{ - return {as_hip_type(op->get_const_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; -} - - -/** - * Generates a uniform batch struct from a batch of dense matrices. - */ -template -inline batch::matrix::dense::uniform_batch> -get_batch_struct(batch::matrix::Dense* const op) -{ - return {as_hip_type(op->get_values()), op->get_num_batch_items(), - static_cast(op->get_common_size()[1]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1])}; -} - - -/** - * Generates an immutable uniform batch struct from a batch of ell matrices. 
- */ -template -inline batch::matrix::ell::uniform_batch, - const IndexType> -get_batch_struct(const batch::matrix::Ell* const op) -{ - return {as_hip_type(op->get_const_values()), - op->get_const_col_idxs(), - op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; -} - - -/** - * Generates a uniform batch struct from a batch of ell matrices. - */ -template -inline batch::matrix::ell::uniform_batch, IndexType> -get_batch_struct(batch::matrix::Ell* const op) -{ - return {as_hip_type(op->get_values()), - op->get_col_idxs(), - op->get_num_batch_items(), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[0]), - static_cast(op->get_common_size()[1]), - static_cast(op->get_num_stored_elements_per_row())}; -} - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_ diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp index e86bc86390a..38a81972e66 100644 --- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp +++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp @@ -8,21 +8,21 @@ #include #include +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/components/diagonal_block_manipulation.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/uninitialized_array.hpp" #include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/batch_struct.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/batch_struct.hip.hpp" #include "hip/base/config.hip.hpp" #include 
"hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" // generated header #include "common/cuda_hip/preconditioner/jacobi_common.hpp" diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index f0f1a715a86..a5de10953bc 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -11,6 +11,7 @@ #include #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" @@ -23,11 +24,10 @@ #include "common/cuda_hip/matrix/batch_csr_kernels.hpp" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" namespace gko { diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index b40732535f4..23bb939ead8 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -11,6 +11,7 @@ #include #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" @@ -23,11 +24,10 @@ #include "common/cuda_hip/matrix/batch_csr_kernels.hpp" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include 
"core/solver/batch_dispatch.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" namespace gko { From 3ffba6a713eefcc61068bb642ab75345e31d7a4b Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 23 Aug 2024 14:47:10 +0200 Subject: [PATCH 150/448] [cuda, hip] rem anon namespace, type defs --- common/cuda_hip/base/batch_multi_vector_kernels.hpp | 4 ---- common/cuda_hip/base/types.hpp | 4 ---- 2 files changed, 8 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp index 1cd9d6c752b..7583cc72292 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp @@ -28,15 +28,11 @@ namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { namespace batch_single_kernels { -namespace { constexpr auto default_block_size = 256; -} - - template __device__ __forceinline__ void scale( const gko::batch::multi_vector::batch_item& alpha, diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp index ee1c76a0585..08f0516d691 100644 --- a/common/cuda_hip/base/types.hpp +++ b/common/cuda_hip/base/types.hpp @@ -8,12 +8,8 @@ #if defined(GKO_COMPILING_CUDA) #include "cuda/base/types.hpp" -#define device_type cuda_type -#define as_device_type as_cuda_type #elif defined(GKO_COMPILING_HIP) #include "hip/base/types.hip.hpp" -#define device_type hip_type -#define as_device_type as_hip_type #else #error "Executor definition missing" #endif From fdab7d4380eab7c69433bc2de53fef5aa7789bc5 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 23 Aug 2024 16:00:43 +0200 Subject: [PATCH 151/448] [ref] set device namespace with CMake --- reference/CMakeLists.txt | 1 + reference/base/batch_multi_vector_kernels.cpp | 4 ---- reference/matrix/batch_csr_kernels.cpp | 4 ---- reference/matrix/batch_dense_kernels.cpp | 4 ---- reference/matrix/batch_ell_kernels.cpp | 4 ---- 5 files changed, 1 
insertion(+), 16 deletions(-) diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 0c226830637..85b8f33e38b 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -66,6 +66,7 @@ target_sources(ginkgo_reference stop/residual_norm_kernels.cpp) target_link_libraries(ginkgo_reference PUBLIC ginkgo_device) +target_compile_definitions(ginkgo_reference PRIVATE GKO_COMPILING_REFERENCE GKO_DEVICE_NAMESPACE=reference) ginkgo_compile_features(ginkgo_reference) ginkgo_default_includes(ginkgo_reference) ginkgo_install_library(ginkgo_reference) diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp index f5e1c653054..d7fbf3ce214 100644 --- a/reference/base/batch_multi_vector_kernels.cpp +++ b/reference/base/batch_multi_vector_kernels.cpp @@ -10,10 +10,6 @@ #include #include - -#define GKO_DEVICE_NAMESPACE reference - - #include "core/base/batch_struct.hpp" #include "reference/base/batch_multi_vector_kernels.hpp" #include "reference/base/batch_struct.hpp" diff --git a/reference/matrix/batch_csr_kernels.cpp b/reference/matrix/batch_csr_kernels.cpp index 9fbb2e35804..d3304ab9795 100644 --- a/reference/matrix/batch_csr_kernels.cpp +++ b/reference/matrix/batch_csr_kernels.cpp @@ -9,10 +9,6 @@ #include #include - -#define GKO_DEVICE_NAMESPACE reference - - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp index 99a7d4e8d7b..599af30ecfb 100644 --- a/reference/matrix/batch_dense_kernels.cpp +++ b/reference/matrix/batch_dense_kernels.cpp @@ -9,10 +9,6 @@ #include #include - -#define GKO_DEVICE_NAMESPACE reference - - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp index 
7772662b216..1a4855f389f 100644 --- a/reference/matrix/batch_ell_kernels.cpp +++ b/reference/matrix/batch_ell_kernels.cpp @@ -9,10 +9,6 @@ #include #include - -#define GKO_DEVICE_NAMESPACE reference - - #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_struct.hpp" From d0a7f4a8f99a7e18eea49ae2f3051fdc39ecb297 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 23 Aug 2024 16:31:57 +0200 Subject: [PATCH 152/448] [unified] rem device_namespace defines in source --- common/unified/base/kernel_launch.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp index 73d37eb2ac2..455d3d67a6d 100644 --- a/common/unified/base/kernel_launch.hpp +++ b/common/unified/base/kernel_launch.hpp @@ -16,7 +16,6 @@ #if defined(GKO_COMPILING_CUDA) -#define GKO_DEVICE_NAMESPACE cuda #define GKO_KERNEL __device__ #include "common/cuda_hip/base/types.hpp" @@ -43,7 +42,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type unpack_member(T value) #elif defined(GKO_COMPILING_HIP) -#define GKO_DEVICE_NAMESPACE hip #define GKO_KERNEL __device__ #include "common/cuda_hip/base/types.hpp" @@ -70,7 +68,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type unpack_member(T value) #elif defined(GKO_COMPILING_DPCPP) -#define GKO_DEVICE_NAMESPACE dpcpp #define GKO_KERNEL @@ -105,7 +102,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type unpack_member(T value) #elif defined(GKO_COMPILING_OMP) -#define GKO_DEVICE_NAMESPACE omp #define GKO_KERNEL From b2069d75db5c1d8e9ccff6128304f8ac20b37108 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Sat, 15 Jun 2024 22:12:41 +0200 Subject: [PATCH 153/448] add schwarz config whose global index from file --- core/CMakeLists.txt | 3 ++ core/config/config_helper.hpp | 3 +- core/config/registry.cpp | 13 ++++- core/config/schwarz_config.cpp | 54 +++++++++++++++++++ core/distributed/preconditioner/schwarz.cpp | 26 +++++++++ .../distributed/preconditioner/schwarz.hpp | 27 +++++++++- 6 files changed, 121 insertions(+), 5 deletions(-) create mode 100644 core/config/schwarz_config.cpp diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index df8f748b4d3..8c802b2eca5 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -7,6 +7,9 @@ set(config_source config/registry.cpp config/solver_config.cpp ) +if(GINKGO_BUILD_MPI) + list(APPEND config_source config/schwarz_config.cpp) +endif() # MSVC: To solve LNK1189, we separate the library as a workaround # To make ginkgo still be the major library, we make the original to ginkgo_core in MSVC/shared # TODO: should think another way to solve it like dllexport or def file diff --git a/core/config/config_helper.hpp b/core/config/config_helper.hpp index f84e6799bf7..555bb75c2a8 100644 --- a/core/config/config_helper.hpp +++ b/core/config/config_helper.hpp @@ -65,7 +65,8 @@ enum class LinOpFactoryType : int { Isai, Jacobi, Multigrid, - Pgm + Pgm, + Schwarz }; diff --git a/core/config/registry.cpp b/core/config/registry.cpp index 1718de5fed2..188c34b35dd 100644 --- a/core/config/registry.cpp +++ b/core/config/registry.cpp @@ -4,6 +4,7 @@ #include "ginkgo/core/config/registry.hpp" +#include #include #include @@ -16,7 +17,9 @@ namespace config { configuration_map generate_config_map() { - return {{"solver::Cg", parse}, + return + { + {"solver::Cg", parse}, {"solver::Bicg", parse}, {"solver::Bicgstab", parse}, {"solver::Fcg", parse}, @@ -42,7 +45,13 @@ configuration_map generate_config_map() {"preconditioner::Isai", parse}, {"preconditioner::Jacobi", parse}, {"solver::Multigrid", parse}, - {"multigrid::Pgm", parse}}; 
+ {"multigrid::Pgm", parse}, +#if GINKGO_BUILD_MPI + { + "preconditioner::Schwarz", parse + } +#endif + }; } diff --git a/core/config/schwarz_config.cpp b/core/config/schwarz_config.cpp new file mode 100644 index 00000000000..dea907dae08 --- /dev/null +++ b/core/config/schwarz_config.cpp @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include +#include +#include + +#include "core/config/config_helper.hpp" +#include "core/config/dispatch.hpp" +#include "core/config/type_descriptor_helper.hpp" + + +namespace gko { +namespace config { + + +template <> +deferred_factory_parameter parse( + const pnode& config, const registry& context, const type_descriptor& td) +{ + auto updated = update_type(config, td); + auto global_index_str = updated.get_index_typestr(); + if (auto& obj = config.get("global_index_type")) { + global_index_str = obj.get_string(); + } + // We can not directly dispatch the global index type without consider local + // index type, which leadw the invalid index type in + // compile time. 
+ if (updated.get_index_typestr() == type_string::str()) { + return dispatch< + gko::LinOpFactory, + gko::experimental::distributed::preconditioner::Schwarz>( + config, context, updated, + make_type_selector(updated.get_value_typestr(), value_type_list()), + make_type_selector(updated.get_index_typestr(), + syn::type_list()), + make_type_selector(global_index_str, index_type_list())); + } else { + return dispatch< + gko::LinOpFactory, + gko::experimental::distributed::preconditioner::Schwarz>( + config, context, updated, + make_type_selector(updated.get_value_typestr(), value_type_list()), + make_type_selector(updated.get_index_typestr(), + syn::type_list()), + make_type_selector(global_index_str, syn::type_list())); + } +} + + +} // namespace config +} // namespace gko diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp index 7235038847d..d5466cd003a 100644 --- a/core/distributed/preconditioner/schwarz.cpp +++ b/core/distributed/preconditioner/schwarz.cpp @@ -12,11 +12,15 @@ #include #include #include +#include +#include #include #include #include #include "core/base/utils.hpp" +#include "core/config/config_helper.hpp" +#include "core/config/dispatch.hpp" #include "core/distributed/helpers.hpp" @@ -26,6 +30,28 @@ namespace distributed { namespace preconditioner { +template +typename Schwarz::parameters_type +Schwarz::parse( + const config::pnode& config, const config::registry& context, + const config::type_descriptor& td_for_child) +{ + auto params = Schwarz::build(); + + if (auto& obj = config.get("generated_local_solver")) { + params.with_generated_local_solver( + gko::config::get_stored_obj(obj, context)); + } + if (auto& obj = config.get("local_solver")) { + params.with_local_solver( + gko::config::parse_or_get_factory( + obj, context, td_for_child)); + } + + return params; +} + + template void Schwarz::apply_impl( const LinOp* b, LinOp* x) const diff --git 
a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp index badd5ba7dd3..a8eca306964 100644 --- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp +++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp @@ -14,6 +14,9 @@ #include #include +#include +#include +#include #include #include @@ -39,8 +42,9 @@ namespace preconditioner { * * @note Currently overlap and coarse grid correction are not supported (TODO). * - * @tparam ValueType precision of matrix elements - * @tparam IndexType integral type of the preconditioner + * @tparam ValueType precision of matrix element + * @tparam LocalIndexType local integer type of the matrix + * @tparam GlobalIndexType global integer type of the matrix * * @ingroup schwarz * @ingroup precond @@ -78,6 +82,25 @@ class Schwarz GKO_ENABLE_LIN_OP_FACTORY(Schwarz, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); + /** + * Create the parameters from the property_tree. + * Because this is directly tied to the specific type, the value/index type + * settings within config are ignored and type_descriptor is only used + * for children objects. + * + * @param config the property tree for setting + * @param context the registry + * @param td_for_child the type descriptor for children objects. The + * default uses the value/local index type of this + * class. + * + * @return parameters + */ + static parameters_type parse( + const config::pnode& config, const config::registry& context, + const config::type_descriptor& td_for_child = + config::make_type_descriptor()); + protected: /** * Creates an empty Schwarz preconditioner. From e1a3341f19349410b035163caa7a633a45347c34 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 31 Jul 2024 00:21:27 +0200 Subject: [PATCH 154/448] only set the global index via type descriptor --- core/config/schwarz_config.cpp | 12 +-- core/config/type_descriptor.cpp | 101 ++++++++++++++---- core/test/config/type_descriptor.cpp | 17 ++- .../ginkgo/core/config/type_descriptor.hpp | 20 +++- .../distributed/preconditioner/schwarz.hpp | 7 +- 5 files changed, 121 insertions(+), 36 deletions(-) diff --git a/core/config/schwarz_config.cpp b/core/config/schwarz_config.cpp index dea907dae08..9543b833041 100644 --- a/core/config/schwarz_config.cpp +++ b/core/config/schwarz_config.cpp @@ -21,12 +21,8 @@ deferred_factory_parameter parse( const pnode& config, const registry& context, const type_descriptor& td) { auto updated = update_type(config, td); - auto global_index_str = updated.get_index_typestr(); - if (auto& obj = config.get("global_index_type")) { - global_index_str = obj.get_string(); - } // We can not directly dispatch the global index type without consider local - // index type, which leadw the invalid index type in + // index type, which leads the invalid index type in // compile time. 
if (updated.get_index_typestr() == type_string::str()) { return dispatch< @@ -36,7 +32,8 @@ deferred_factory_parameter parse( make_type_selector(updated.get_value_typestr(), value_type_list()), make_type_selector(updated.get_index_typestr(), syn::type_list()), - make_type_selector(global_index_str, index_type_list())); + make_type_selector(updated.get_global_index_typestr(), + index_type_list())); } else { return dispatch< gko::LinOpFactory, @@ -45,7 +42,8 @@ deferred_factory_parameter parse( make_type_selector(updated.get_value_typestr(), value_type_list()), make_type_selector(updated.get_index_typestr(), syn::type_list()), - make_type_selector(global_index_str, syn::type_list())); + make_type_selector(updated.get_global_index_typestr(), + syn::type_list())); } } diff --git a/core/config/type_descriptor.cpp b/core/config/type_descriptor.cpp index 93ec1d3f929..017cc98ca06 100644 --- a/core/config/type_descriptor.cpp +++ b/core/config/type_descriptor.cpp @@ -17,6 +17,7 @@ type_descriptor update_type(const pnode& config, const type_descriptor& td) { auto value_typestr = td.get_value_typestr(); auto index_typestr = td.get_index_typestr(); + auto global_index_typestr = td.get_global_index_typestr(); if (auto& obj = config.get("value_type")) { value_typestr = obj.get_string(); @@ -26,37 +27,93 @@ type_descriptor update_type(const pnode& config, const type_descriptor& td) "Setting index_type in the config is not allowed. Please set the " "proper index_type through type_descriptor of parse"); } - return type_descriptor{value_typestr, index_typestr}; + if (auto& obj = config.get("global_index_type")) { + GKO_INVALID_STATE( + "Setting global_index_type in the config is not allowed. 
Please " + "set the proper global_index_type through type_descriptor of " + "parse"); + } + return type_descriptor{value_typestr, index_typestr, global_index_typestr}; } -template +template type_descriptor make_type_descriptor() { return type_descriptor{type_string::str(), - type_string::str()}; + type_string::str(), + type_string::str()}; } -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor, void>(); -template type_descriptor make_type_descriptor, void>(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor, int32>(); -template type_descriptor make_type_descriptor, int32>(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor, int64>(); -template type_descriptor make_type_descriptor, int64>(); +// global_index: void +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor +make_type_descriptor, void, void>(); +template type_descriptor +make_type_descriptor, void, void>(); +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor +make_type_descriptor, int32, void>(); +template type_descriptor +make_type_descriptor, int32, void>(); +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor +make_type_descriptor, int64, void>(); +template type_descriptor +make_type_descriptor, int64, 
void>(); + +// global_index int32 +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor +make_type_descriptor, void, int32>(); +template type_descriptor +make_type_descriptor, void, int32>(); +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor +make_type_descriptor, int32, int32>(); +template type_descriptor +make_type_descriptor, int32, int32>(); + +// global_index_type int64 +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor +make_type_descriptor, void, int64>(); +template type_descriptor +make_type_descriptor, void, int64>(); +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor +make_type_descriptor, int32, int64>(); +template type_descriptor +make_type_descriptor, int32, int64>(); +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor make_type_descriptor(); +template type_descriptor +make_type_descriptor, int64, int64>(); +template type_descriptor +make_type_descriptor, int64, int64>(); type_descriptor::type_descriptor(std::string value_typestr, - std::string index_typestr) - : value_typestr_(value_typestr), index_typestr_(index_typestr) + std::string index_typestr, + std::string global_index_typestr) + : value_typestr_(value_typestr), + index_typestr_(index_typestr), + global_index_typestr_(global_index_typestr) {} const std::string& type_descriptor::get_value_typestr() const @@ -69,6 +126,10 @@ const std::string& type_descriptor::get_index_typestr() const return index_typestr_; } +const std::string& 
type_descriptor::get_global_index_typestr() const +{ + return global_index_typestr_; +} } // namespace config } // namespace gko diff --git a/core/test/config/type_descriptor.cpp b/core/test/config/type_descriptor.cpp index ff519e88101..86505df51d1 100644 --- a/core/test/config/type_descriptor.cpp +++ b/core/test/config/type_descriptor.cpp @@ -21,6 +21,15 @@ TEST(TypeDescriptor, TemplateCreate) ASSERT_EQ(td.get_value_typestr(), "float64"); ASSERT_EQ(td.get_index_typestr(), "int32"); + ASSERT_EQ(td.get_global_index_typestr(), "int64"); + } + { + SCOPED_TRACE("specify global indextype"); + auto td = make_type_descriptor(); + + ASSERT_EQ(td.get_value_typestr(), "float32"); + ASSERT_EQ(td.get_index_typestr(), "int32"); + ASSERT_EQ(td.get_global_index_typestr(), "int"); } { SCOPED_TRACE("specify valuetype"); @@ -28,20 +37,24 @@ TEST(TypeDescriptor, TemplateCreate) ASSERT_EQ(td.get_value_typestr(), "float32"); ASSERT_EQ(td.get_index_typestr(), "int32"); + ASSERT_EQ(td.get_global_index_typestr(), "int64"); } { SCOPED_TRACE("specify all template"); - auto td = make_type_descriptor, gko::int64>(); + auto td = + make_type_descriptor, gko::int64, gko::int64>(); ASSERT_EQ(td.get_value_typestr(), "complex"); ASSERT_EQ(td.get_index_typestr(), "int64"); + ASSERT_EQ(td.get_global_index_typestr(), "int64"); } { SCOPED_TRACE("specify void"); - auto td = make_type_descriptor(); + auto td = make_type_descriptor(); ASSERT_EQ(td.get_value_typestr(), "void"); ASSERT_EQ(td.get_index_typestr(), "void"); + ASSERT_EQ(td.get_global_index_typestr(), "void"); } } diff --git a/include/ginkgo/core/config/type_descriptor.hpp b/include/ginkgo/core/config/type_descriptor.hpp index 48475f7f469..aa75b4591fa 100644 --- a/include/ginkgo/core/config/type_descriptor.hpp +++ b/include/ginkgo/core/config/type_descriptor.hpp @@ -8,6 +8,8 @@ #include +#include + namespace gko { namespace config { @@ -27,10 +29,9 @@ namespace config { * value `void` can be used to specify that no default type is provided. 
In this * case, the configuration has to provide the necessary template types. * - * If the configuration specifies one of the fields (or both): + * If the configuration specifies one field (only allow value_type now): * ``` * value_type: "some_value_type" - * index_type: "some_index_type" * ``` * these types will take precedence over the type_descriptor. */ @@ -42,12 +43,15 @@ class type_descriptor final { * * @param value_typestr the value type string. "void" means no default. * @param index_typestr the index type string. "void" means no default. + * @param global_index_typestr the global index type string. "void" means + * no default. * * @note there is no way to call the constructor with explicit template, so * we create another free function to handle it. */ explicit type_descriptor(std::string value_typestr = "float64", - std::string index_typestr = "int32"); + std::string index_typestr = "int32", + std::string global_index_typestr = "int64"); /** * Get the value type string. @@ -59,9 +63,15 @@ class type_descriptor final { */ const std::string& get_index_typestr() const; + /** + * Get the global index type string + */ + const std::string& get_global_index_typestr() const; + private: std::string value_typestr_; std::string index_typestr_; + std::string global_index_typestr_; }; @@ -71,8 +81,10 @@ class type_descriptor final { * * @tparam ValueType the value type in descriptor * @tparam IndexType the index type in descriptor + * @tparam GlobalIndexType the global index type in descriptor */ -template +template type_descriptor make_type_descriptor(); diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp index a8eca306964..adc67dfbd36 100644 --- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp +++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp @@ -91,15 +91,16 @@ class Schwarz * @param config the property tree for setting * @param context the registry * 
@param td_for_child the type descriptor for children objects. The - * default uses the value/local index type of this - * class. + * default uses the value/local/global index type of + * this class. * * @return parameters */ static parameters_type parse( const config::pnode& config, const config::registry& context, const config::type_descriptor& td_for_child = - config::make_type_descriptor()); + config::make_type_descriptor()); protected: /** From 36adda67e407b590f02e31bd3aa2460941a9ba92 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 1 Aug 2024 16:12:50 +0200 Subject: [PATCH 155/448] add schwarz config test --- core/test/config/CMakeLists.txt | 1 + core/test/config/preconditioner.cpp | 74 +++++++++++++++++++++++++++- core/test/config/type_descriptor.cpp | 2 +- 3 files changed, 75 insertions(+), 2 deletions(-) diff --git a/core/test/config/CMakeLists.txt b/core/test/config/CMakeLists.txt index c3c400cc8b4..4460e2ed916 100644 --- a/core/test/config/CMakeLists.txt +++ b/core/test/config/CMakeLists.txt @@ -5,3 +5,4 @@ ginkgo_create_test(preconditioner) ginkgo_create_test(property_tree) ginkgo_create_test(registry) ginkgo_create_test(solver) +ginkgo_create_test(type_descriptor) diff --git a/core/test/config/preconditioner.cpp b/core/test/config/preconditioner.cpp index b11ea3b6705..9e81e690967 100644 --- a/core/test/config/preconditioner.cpp +++ b/core/test/config/preconditioner.cpp @@ -6,8 +6,11 @@ #include +#include #include #include +#include +#include #include #include #include @@ -297,6 +300,68 @@ struct Jacobi }; +#if GINKGO_BUILD_MPI + + +struct Schwarz + : PreconditionerConfigTest< + ::gko::experimental::distributed::preconditioner::Schwarz, + ::gko::experimental::distributed::preconditioner::Schwarz< + double, int, gko::int64>> { + static pnode::map_type setup_base() + { + return {{"type", pnode{"preconditioner::Schwarz"}}}; + } + + static void change_template(pnode::map_type& config_map) + { + config_map["value_type"] = pnode{"float32"}; + } + 
+ template + static void set(pnode::map_type& config_map, ParamType& param, registry reg, + std::shared_ptr exec) + { + if (from_reg) { + config_map["local_solver"] = pnode{"solver"}; + param.with_local_solver( + detail::registry_accessor::get_data( + reg, "solver")); + } else { + config_map["local_solver"] = + pnode{{{"type", pnode{"solver::Ir"}}, + {"value_type", pnode{"float32"}}}}; + param.with_local_solver(DummyIr::build().on(exec)); + } + config_map["generated_local_solver"] = pnode{"linop"}; + param.with_generated_local_solver( + detail::registry_accessor::get_data(reg, "linop")); + } + + template + static void validate(gko::LinOpFactory* result, AnswerType* answer) + { + auto res_param = gko::as(result)->get_parameters(); + auto ans_param = answer->get_parameters(); + + if (from_reg) { + ASSERT_EQ(res_param.local_solver, ans_param.local_solver); + } else { + ASSERT_NE( + std::dynamic_pointer_cast( + res_param.local_solver), + nullptr); + } + ASSERT_EQ(res_param.generated_local_solver, + ans_param.generated_local_solver); + } +}; + + +#endif // GINKGO_BUILD_MPI + + template class Preconditioner : public ::testing::Test { protected: @@ -309,12 +374,14 @@ class Preconditioner : public ::testing::Test { l_solver(DummyIr::build().on(exec)), u_solver(DummyIr::build().on(exec)), factorization(DummyIr::build().on(exec)), + linop(gko::matrix::Dense<>::create(exec)), reg() { reg.emplace("solver", solver_factory); reg.emplace("l_solver", l_solver); reg.emplace("u_solver", u_solver); reg.emplace("factorization", factorization); + reg.emplace("linop", linop); } std::shared_ptr exec; @@ -323,11 +390,16 @@ class Preconditioner : public ::testing::Test { std::shared_ptr l_solver; std::shared_ptr u_solver; std::shared_ptr factorization; + std::shared_ptr linop; registry reg; }; -using PreconditionerTypes = ::testing::Types<::Ic, ::Ilu, ::Isai, ::Jacobi>; +using PreconditionerTypes = ::testing::Types< +#if GINKGO_BUILD_MPI + ::Schwarz, +#endif // GINKGO_BUILD_MPI + ::Ic, 
::Ilu, ::Isai, ::Jacobi>; TYPED_TEST_SUITE(Preconditioner, PreconditionerTypes, TypenameNameGenerator); diff --git a/core/test/config/type_descriptor.cpp b/core/test/config/type_descriptor.cpp index 86505df51d1..f044d60716f 100644 --- a/core/test/config/type_descriptor.cpp +++ b/core/test/config/type_descriptor.cpp @@ -29,7 +29,7 @@ TEST(TypeDescriptor, TemplateCreate) ASSERT_EQ(td.get_value_typestr(), "float32"); ASSERT_EQ(td.get_index_typestr(), "int32"); - ASSERT_EQ(td.get_global_index_typestr(), "int"); + ASSERT_EQ(td.get_global_index_typestr(), "int32"); } { SCOPED_TRACE("specify valuetype"); From 4761e53c7e9bad9803fa24a158e11901c06833ff Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Fri, 23 Aug 2024 18:05:21 +0200 Subject: [PATCH 156/448] update documentation and use macro Co-authored-by: Marcel Koch Co-authored-by: Pratik Nayak --- core/config/type_descriptor.cpp | 79 +++++-------------- core/test/config/type_descriptor.cpp | 40 ++++++++-- .../ginkgo/core/config/type_descriptor.hpp | 22 ++++-- 3 files changed, 67 insertions(+), 74 deletions(-) diff --git a/core/config/type_descriptor.cpp b/core/config/type_descriptor.cpp index 017cc98ca06..fe11b785d6f 100644 --- a/core/config/type_descriptor.cpp +++ b/core/config/type_descriptor.cpp @@ -5,6 +5,7 @@ #include "ginkgo/core/config/type_descriptor.hpp" #include +#include #include "core/config/type_descriptor_helper.hpp" @@ -45,67 +46,18 @@ type_descriptor make_type_descriptor() type_string::str()}; } -// global_index: void -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor -make_type_descriptor, void, void>(); -template type_descriptor -make_type_descriptor, void, void>(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor -make_type_descriptor, int32, 
void>(); -template type_descriptor -make_type_descriptor, int32, void>(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor -make_type_descriptor, int64, void>(); -template type_descriptor -make_type_descriptor, int64, void>(); - -// global_index int32 -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor -make_type_descriptor, void, int32>(); -template type_descriptor -make_type_descriptor, void, int32>(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor -make_type_descriptor, int32, int32>(); -template type_descriptor -make_type_descriptor, int32, int32>(); - -// global_index_type int64 -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor -make_type_descriptor, void, int64>(); -template type_descriptor -make_type_descriptor, void, int64>(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor -make_type_descriptor, int32, int64>(); -template type_descriptor -make_type_descriptor, int32, int64>(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor make_type_descriptor(); -template type_descriptor -make_type_descriptor, int64, int64>(); -template type_descriptor -make_type_descriptor, int64, int64>(); +#define GKO_DECLARE_MAKE_TYPE_DESCRIPTOR(ValueType, LocalIndexType, \ + GlobalIndexType) \ + type_descriptor \ + make_type_descriptor() 
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_MAKE_TYPE_DESCRIPTOR); + +#define GKO_DECLARE_MAKE_VOID_TYPE_DESCRIPTOR(LocalIndexType, GlobalIndexType) \ + type_descriptor \ + make_type_descriptor() +GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_MAKE_VOID_TYPE_DESCRIPTOR); type_descriptor::type_descriptor(std::string value_typestr, @@ -126,6 +78,11 @@ const std::string& type_descriptor::get_index_typestr() const return index_typestr_; } +const std::string& type_descriptor::get_local_index_typestr() const +{ + return this->get_index_typestr(); +} + const std::string& type_descriptor::get_global_index_typestr() const { return global_index_typestr_; diff --git a/core/test/config/type_descriptor.cpp b/core/test/config/type_descriptor.cpp index f044d60716f..e8a7327a6a2 100644 --- a/core/test/config/type_descriptor.cpp +++ b/core/test/config/type_descriptor.cpp @@ -21,6 +21,7 @@ TEST(TypeDescriptor, TemplateCreate) ASSERT_EQ(td.get_value_typestr(), "float64"); ASSERT_EQ(td.get_index_typestr(), "int32"); + ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr()); ASSERT_EQ(td.get_global_index_typestr(), "int64"); } { @@ -29,6 +30,7 @@ TEST(TypeDescriptor, TemplateCreate) ASSERT_EQ(td.get_value_typestr(), "float32"); ASSERT_EQ(td.get_index_typestr(), "int32"); + ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr()); ASSERT_EQ(td.get_global_index_typestr(), "int32"); } { @@ -37,24 +39,37 @@ TEST(TypeDescriptor, TemplateCreate) ASSERT_EQ(td.get_value_typestr(), "float32"); ASSERT_EQ(td.get_index_typestr(), "int32"); + ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr()); ASSERT_EQ(td.get_global_index_typestr(), "int64"); } { - SCOPED_TRACE("specify all template"); + SCOPED_TRACE("specify local index template"); auto td = make_type_descriptor, gko::int64, gko::int64>(); ASSERT_EQ(td.get_value_typestr(), "complex"); ASSERT_EQ(td.get_index_typestr(), "int64"); + ASSERT_EQ(td.get_local_index_typestr(), 
td.get_index_typestr()); ASSERT_EQ(td.get_global_index_typestr(), "int64"); } + { + SCOPED_TRACE("specify global index template"); + auto td = + make_type_descriptor, gko::int32, gko::int32>(); + + ASSERT_EQ(td.get_value_typestr(), "complex"); + ASSERT_EQ(td.get_index_typestr(), "int32"); + ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr()); + ASSERT_EQ(td.get_global_index_typestr(), "int32"); + } { SCOPED_TRACE("specify void"); - auto td = make_type_descriptor(); + auto td = make_type_descriptor(); ASSERT_EQ(td.get_value_typestr(), "void"); - ASSERT_EQ(td.get_index_typestr(), "void"); - ASSERT_EQ(td.get_global_index_typestr(), "void"); + ASSERT_EQ(td.get_index_typestr(), "int32"); + ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr()); + ASSERT_EQ(td.get_global_index_typestr(), "int64"); } } @@ -67,6 +82,8 @@ TEST(TypeDescriptor, Constructor) ASSERT_EQ(td.get_value_typestr(), "float64"); ASSERT_EQ(td.get_index_typestr(), "int32"); + ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr()); + ASSERT_EQ(td.get_global_index_typestr(), "int64"); } { SCOPED_TRACE("specify valuetype"); @@ -74,12 +91,25 @@ TEST(TypeDescriptor, Constructor) ASSERT_EQ(td.get_value_typestr(), "float32"); ASSERT_EQ(td.get_index_typestr(), "int32"); + ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr()); + ASSERT_EQ(td.get_global_index_typestr(), "int64"); } { - SCOPED_TRACE("specify all parameters"); + SCOPED_TRACE("specify local index parameters"); type_descriptor td("void", "int64"); ASSERT_EQ(td.get_value_typestr(), "void"); ASSERT_EQ(td.get_index_typestr(), "int64"); + ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr()); + ASSERT_EQ(td.get_global_index_typestr(), "int64"); + } + { + SCOPED_TRACE("specify global index parameters"); + type_descriptor td("void", "int32", "int32"); + + ASSERT_EQ(td.get_value_typestr(), "void"); + ASSERT_EQ(td.get_index_typestr(), "int32"); + ASSERT_EQ(td.get_local_index_typestr(), td.get_index_typestr()); 
+ ASSERT_EQ(td.get_global_index_typestr(), "int32"); } } diff --git a/include/ginkgo/core/config/type_descriptor.hpp b/include/ginkgo/core/config/type_descriptor.hpp index aa75b4591fa..5c719340436 100644 --- a/include/ginkgo/core/config/type_descriptor.hpp +++ b/include/ginkgo/core/config/type_descriptor.hpp @@ -25,15 +25,16 @@ namespace config { * auto cg = parse(config, context, type_descriptor("float64", "int32")); * ``` * will have the value type `float64` and the index type `int32`. Any Ginkgo - * object that does not require one of these types will just ignore it. The - * value `void` can be used to specify that no default type is provided. In this - * case, the configuration has to provide the necessary template types. + * object that does not require one of these types will just ignore it. In + * value_type, one additional value `void` can be used to specify that no + * default type is provided. In this case, the configuration has to provide the + * necessary template types. * * If the configuration specifies one field (only allow value_type now): * ``` * value_type: "some_value_type" * ``` - * these types will take precedence over the type_descriptor. + * this type will take precedence over the type_descriptor. */ class type_descriptor final { public: @@ -42,9 +43,8 @@ class type_descriptor final { * `make_type_descriptor` to create the object by template. * * @param value_typestr the value type string. "void" means no default. - * @param index_typestr the index type string. "void" means no default. - * @param global_index_typestr the global index type string. "void" means - * no default. + * @param index_typestr the (local) index type string. + * @param global_index_typestr the global index type string. * * @note there is no way to call the constructor with explicit template, so * we create another free function to handle it. 
@@ -63,6 +63,12 @@ class type_descriptor final { */ const std::string& get_index_typestr() const; + /** + * Get the local index type string, which gives the same result as + * get_index_typestr() + */ + const std::string& get_local_index_typestr() const; + /** * Get the global index type string */ @@ -83,7 +89,7 @@ class type_descriptor final { * @tparam IndexType the index type in descriptor * @tparam GlobalIndexType the global index type in descriptor */ -template type_descriptor make_type_descriptor(); From aab84e9fc2e329ab87a3bb3464e1b14d093d9b26 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 21 Aug 2024 15:38:17 +0200 Subject: [PATCH 157/448] remove assertion workaround This causes some kernels on ROCm debug builds to fail --- include/ginkgo/core/base/types.hpp | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 8e2096c09e2..e375da15f9c 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -51,30 +51,8 @@ #endif -#if (defined(__CUDA_ARCH__) && defined(__APPLE__)) || \ - defined(__HIP_DEVICE_COMPILE__) - -#ifdef NDEBUG -#define GKO_ASSERT(condition) ((void)0) -#else // NDEBUG -// Poor man's assertions on GPUs for MACs. They won't terminate the program -// but will at least print something on the screen -#define GKO_ASSERT(condition) \ - ((condition) \ - ? 
((void)0) \ - : ((void)printf("%s: %d: %s: Assertion `" #condition "' failed\n", \ - __FILE__, __LINE__, __func__))) -#endif // NDEBUG - -#else // (defined(__CUDA_ARCH__) && defined(__APPLE__)) || - // defined(__HIP_DEVICE_COMPILE__) - -// Handle assertions normally on other systems #define GKO_ASSERT(condition) assert(condition) -#endif // (defined(__CUDA_ARCH__) && defined(__APPLE__)) || - // defined(__HIP_DEVICE_COMPILE__) - // Handle deprecated notices correctly on different systems // clang-format off From 18eb9d7bf3aa750db981fa203b31753f2503bb4d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 24 Aug 2024 10:47:42 +0200 Subject: [PATCH 158/448] fix ROCm 6.x segfaults on MI50 There is some weird interaction between inlining of shfl_xor and the (otherwise unused) members of thread_block_tile. The easiest way of working around it is to inline them explicitly as __shfl_xor(_sync). --- common/cuda_hip/components/sorting.hpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/common/cuda_hip/components/sorting.hpp b/common/cuda_hip/components/sorting.hpp index 7603d41a8ba..c15c4a70c64 100644 --- a/common/cuda_hip/components/sorting.hpp +++ b/common/cuda_hip/components/sorting.hpp @@ -113,11 +113,15 @@ struct bitonic_warp { __forceinline__ __device__ static void merge(ValueType* els, bool reverse) { - auto tile = - group::tiled_partition(group::this_thread_block()); auto new_reverse = reverse != upper_half(); for (int i = 0; i < num_local; ++i) { - auto other = tile.shfl_xor(els[i], num_threads / 2); + // workaround for ROCm 6.x segfaults on gfx906 +#ifdef GKO_COMPILING_CUDA + auto other = __shfl_xor_sync(config::full_lane_mask, els[i], + num_threads / 2); +#else + auto other = __shfl_xor(els[i], num_threads / 2); +#endif bitonic_cas(els[i], other, new_reverse); } half::merge(els, reverse); From 71cd5eec814fe3c061bdb9cb3c3172e5c014a0c9 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 25 Aug 2024 16:56:20 +0200 Subject: [PATCH 
159/448] more precise shuffle bounds --- common/cuda_hip/components/sorting.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/cuda_hip/components/sorting.hpp b/common/cuda_hip/components/sorting.hpp index c15c4a70c64..76694541c2d 100644 --- a/common/cuda_hip/components/sorting.hpp +++ b/common/cuda_hip/components/sorting.hpp @@ -118,9 +118,9 @@ struct bitonic_warp { // workaround for ROCm 6.x segfaults on gfx906 #ifdef GKO_COMPILING_CUDA auto other = __shfl_xor_sync(config::full_lane_mask, els[i], - num_threads / 2); + num_threads / 2, num_threads); #else - auto other = __shfl_xor(els[i], num_threads / 2); + auto other = __shfl_xor(els[i], num_threads / 2, num_threads); #endif bitonic_cas(els[i], other, new_reverse); } From 4a56f71db7e9e9bd339301074efbe3b1eedc1a95 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Fri, 23 Aug 2024 23:59:30 +0200 Subject: [PATCH 160/448] [kernels] fix odr violations --- common/cuda_hip/base/batch_multi_vector_kernels.hpp | 7 +++++++ common/cuda_hip/matrix/batch_csr_kernels.hpp | 7 +++++++ common/cuda_hip/matrix/batch_dense_kernels.hpp | 7 +++++++ common/cuda_hip/matrix/batch_ell_kernels.hpp | 7 +++++++ dpcpp/base/batch_multi_vector_kernels.hpp | 7 +++++++ dpcpp/matrix/batch_csr_kernels.hpp | 7 +++++++ dpcpp/matrix/batch_dense_kernels.hpp | 7 +++++++ dpcpp/matrix/batch_ell_kernels.hpp | 7 +++++++ reference/base/batch_multi_vector_kernels.hpp | 7 +++++++ reference/matrix/batch_csr_kernels.hpp | 7 +++++++ reference/matrix/batch_dense_kernels.hpp | 7 +++++++ reference/matrix/batch_ell_kernels.hpp | 7 +++++++ 12 files changed, 84 insertions(+) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp index 7583cc72292..3f5763474c2 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef 
GKO_COMMON_CUDA_HIP_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_ + + #include #include @@ -315,3 +319,6 @@ __global__ __launch_bounds__(default_block_size) void copy_kernel( } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko + + +#endif diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp b/common/cuda_hip/matrix/batch_csr_kernels.hpp index 64611559715..5ed66c59d14 100644 --- a/common/cuda_hip/matrix/batch_csr_kernels.hpp +++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_MATRIX_BATCH_CSR_KERNELS_HPP_ +#define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_CSR_KERNELS_HPP_ + + #include #include @@ -200,3 +204,6 @@ __global__ void add_scaled_identity_kernel( } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko + + +#endif diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp b/common/cuda_hip/matrix/batch_dense_kernels.hpp index e4cd24bbd78..7902d6010fa 100644 --- a/common/cuda_hip/matrix/batch_dense_kernels.hpp +++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_MATRIX_BATCH_DENSE_KERNELS_HPP_ +#define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_DENSE_KERNELS_HPP_ + + #include #include @@ -247,3 +251,6 @@ __global__ void add_scaled_identity_kernel( } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko + + +#endif diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp b/common/cuda_hip/matrix/batch_ell_kernels.hpp index 52826957ddb..f32144dc172 100644 --- a/common/cuda_hip/matrix/batch_ell_kernels.hpp +++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_MATRIX_BATCH_ELL_KERNELS_HPP_ +#define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_ELL_KERNELS_HPP_ + + #include #include @@ -210,3 +214,6 @@ 
__global__ void add_scaled_identity_kernel( } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko + + +#endif diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp b/dpcpp/base/batch_multi_vector_kernels.hpp index bbcc540ae60..142eba259de 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp +++ b/dpcpp/base/batch_multi_vector_kernels.hpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_DPCPP_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_ +#define GKO_DPCPP_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_ + + #include #include @@ -257,3 +261,6 @@ __dpct_inline__ void copy_kernel( } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko + + +#endif diff --git a/dpcpp/matrix/batch_csr_kernels.hpp b/dpcpp/matrix/batch_csr_kernels.hpp index f51124f81a4..2b195de308b 100644 --- a/dpcpp/matrix/batch_csr_kernels.hpp +++ b/dpcpp/matrix/batch_csr_kernels.hpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_DPCPP_MATRIX_BATCH_CSR_KERNELS_HPP_ +#define GKO_DPCPP_MATRIX_BATCH_CSR_KERNELS_HPP_ + + #include #include @@ -108,3 +112,6 @@ __dpct_inline__ void add_scaled_identity( } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko + + +#endif diff --git a/dpcpp/matrix/batch_dense_kernels.hpp b/dpcpp/matrix/batch_dense_kernels.hpp index acf1e65939d..59aee9a7208 100644 --- a/dpcpp/matrix/batch_dense_kernels.hpp +++ b/dpcpp/matrix/batch_dense_kernels.hpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_DPCPP_MATRIX_BATCH_DENSE_KERNELS_HPP_ +#define GKO_DPCPP_MATRIX_BATCH_DENSE_KERNELS_HPP_ + + #include #include @@ -170,3 +174,6 @@ __dpct_inline__ void add_scaled_identity( } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko + + +#endif diff --git a/dpcpp/matrix/batch_ell_kernels.hpp b/dpcpp/matrix/batch_ell_kernels.hpp index 48ab9318bdf..5a1ba163216 100644 --- a/dpcpp/matrix/batch_ell_kernels.hpp +++ b/dpcpp/matrix/batch_ell_kernels.hpp 
@@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_DPCPP_MATRIX_BATCH_ELL_KERNELS_HPP_ +#define GKO_DPCPP_MATRIX_BATCH_ELL_KERNELS_HPP_ + + #include #include @@ -119,3 +123,6 @@ __dpct_inline__ void add_scaled_identity( } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko + + +#endif diff --git a/reference/base/batch_multi_vector_kernels.hpp b/reference/base/batch_multi_vector_kernels.hpp index 88f531f29cc..140072fd301 100644 --- a/reference/base/batch_multi_vector_kernels.hpp +++ b/reference/base/batch_multi_vector_kernels.hpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_REFERENCE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_ +#define GKO_REFERENCE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_ + + #include #include #include @@ -149,3 +153,6 @@ inline void copy_kernel( } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko + + +#endif diff --git a/reference/matrix/batch_csr_kernels.hpp b/reference/matrix/batch_csr_kernels.hpp index e04b2bdf345..8f1bfe400e3 100644 --- a/reference/matrix/batch_csr_kernels.hpp +++ b/reference/matrix/batch_csr_kernels.hpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_REFERENCE_MATRIX_BATCH_CSR_KERNELS_HPP_ +#define GKO_REFERENCE_MATRIX_BATCH_CSR_KERNELS_HPP_ + + #include #include @@ -99,3 +103,6 @@ inline void add_scaled_identity( } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko + + +#endif diff --git a/reference/matrix/batch_dense_kernels.hpp b/reference/matrix/batch_dense_kernels.hpp index e12827c77de..7fd6a8cdbb5 100644 --- a/reference/matrix/batch_dense_kernels.hpp +++ b/reference/matrix/batch_dense_kernels.hpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_REFERENCE_MATRIX_BATCH_DENSE_KERNELS_HPP_ +#define GKO_REFERENCE_MATRIX_BATCH_DENSE_KERNELS_HPP_ + + #include #include @@ -128,3 +132,6 @@ inline void add_scaled_identity( } // namespace GKO_DEVICE_NAMESPACE } // 
namespace kernels } // namespace gko + + +#endif diff --git a/reference/matrix/batch_ell_kernels.hpp b/reference/matrix/batch_ell_kernels.hpp index 71bd1ce851a..cfdc2040d8f 100644 --- a/reference/matrix/batch_ell_kernels.hpp +++ b/reference/matrix/batch_ell_kernels.hpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_REFERENCE_MATRIX_BATCH_ELL_KERNELS_HPP_ +#define GKO_REFERENCE_MATRIX_BATCH_ELL_KERNELS_HPP_ + + #include #include @@ -114,3 +118,6 @@ inline void add_scaled_identity( } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko + + +#endif From 0e6106b7f93e9bdcf0d6850bd167c13855de3828 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sat, 24 Aug 2024 00:01:04 +0200 Subject: [PATCH 161/448] [cuda, hip] move inc to full headers --- ...els.hpp.inc => batch_bicgstab_kernels.hpp} | 96 +++++++++++++------ ...g_kernels.hpp.inc => batch_cg_kernels.hpp} | 75 +++++++++++---- cuda/solver/batch_bicgstab_kernels.cu | 31 +++--- cuda/solver/batch_cg_kernels.cu | 31 +++--- hip/solver/batch_bicgstab_kernels.hip.cpp | 13 +-- hip/solver/batch_cg_kernels.hip.cpp | 13 +-- 6 files changed, 151 insertions(+), 108 deletions(-) rename common/cuda_hip/solver/{batch_bicgstab_kernels.hpp.inc => batch_bicgstab_kernels.hpp} (81%) rename common/cuda_hip/solver/{batch_cg_kernels.hpp.inc => batch_cg_kernels.hpp} (78%) diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp similarity index 81% rename from common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc rename to common/cuda_hip/solver/batch_bicgstab_kernels.hpp index d4ce149d394..cbab8ed6961 100644 --- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc +++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp @@ -2,6 +2,42 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ +#define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ + + +#include +#include + 
+#include +#include +#include + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template __device__ __forceinline__ void initialize( Group subgroup, const int num_rows, const BatchMatrixType_entry& mat_entry, @@ -27,20 +63,18 @@ __device__ __forceinline__ void initialize( __syncthreads(); // r = b - A*x - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + batch_single_kernels::advanced_apply( static_cast(-1.0), mat_entry, x_shared_entry, static_cast(1.0), r_shared_entry); __syncthreads(); if (threadIdx.x / config::warp_size == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry, - res_norm); + batch_single_kernels::single_rhs_compute_norm2( + subgroup, num_rows, r_shared_entry, res_norm); } else if (threadIdx.x / config::warp_size == 1) { // Compute norms of rhs - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, - rhs_norm); + 
batch_single_kernels::single_rhs_compute_norm2( + subgroup, num_rows, b_global_entry, rhs_norm); } __syncthreads(); @@ -75,9 +109,8 @@ __device__ __forceinline__ void compute_alpha( const ValueType* const v_shared_entry, ValueType& alpha) { if (threadIdx.x / config::warp_size == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry, - v_shared_entry, alpha); + batch_single_kernels::single_rhs_compute_conj_dot( + subgroup, num_rows, r_hat_shared_entry, v_shared_entry, alpha); } __syncthreads(); if (threadIdx.x == 0) { @@ -105,13 +138,11 @@ __device__ __forceinline__ void compute_omega( const ValueType* const s_shared_entry, ValueType& temp, ValueType& omega) { if (threadIdx.x / config::warp_size == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry, - s_shared_entry, omega); + batch_single_kernels::single_rhs_compute_conj_dot( + subgroup, num_rows, t_shared_entry, s_shared_entry, omega); } else if (threadIdx.x / config::warp_size == 1) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry, - t_shared_entry, temp); + batch_single_kernels::single_rhs_compute_conj_dot( + subgroup, num_rows, t_shared_entry, t_shared_entry, temp); } __syncthreads(); @@ -279,9 +310,8 @@ __global__ void apply_kernel( // rho_new = < r_hat , r > = (r_hat)' * (r) if (threadIdx.x / config::warp_size == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh, - r_sh, rho_new_sh[0]); + batch_single_kernels::single_rhs_compute_conj_dot( + subgroup, num_rows, r_hat_sh, r_sh, rho_new_sh[0]); } __syncthreads(); @@ -296,8 +326,7 @@ __global__ void apply_kernel( __syncthreads(); // v = A * p_hat - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - simple_apply(mat_entry, p_hat_sh, v_sh); + 
batch_single_kernels::simple_apply(mat_entry, p_hat_sh, v_sh); __syncthreads(); // alpha = rho_new / < r_hat , v> @@ -311,9 +340,8 @@ __global__ void apply_kernel( // an estimate of residual norms if (threadIdx.x / config::warp_size == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_norm2(subgroup, num_rows, s_sh, - norms_res_sh[0]); + batch_single_kernels::single_rhs_compute_norm2( + subgroup, num_rows, s_sh, norms_res_sh[0]); } __syncthreads(); @@ -329,8 +357,7 @@ __global__ void apply_kernel( __syncthreads(); // t = A * s_hat - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - simple_apply(mat_entry, s_hat_sh, t_sh); + batch_single_kernels::simple_apply(mat_entry, s_hat_sh, t_sh); __syncthreads(); // omega = / @@ -345,9 +372,8 @@ __global__ void apply_kernel( __syncthreads(); if (threadIdx.x / config::warp_size == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_norm2(subgroup, num_rows, r_sh, - norms_res_sh[0]); + batch_single_kernels::single_rhs_compute_norm2( + subgroup, num_rows, r_sh, norms_res_sh[0]); } //__syncthreads(); @@ -360,8 +386,16 @@ __global__ void apply_kernel( logger.log_iteration(batch_id, iter, norms_res_sh[0]); // copy x back to global memory - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr); + batch_single_kernels::single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr); __syncthreads(); } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc b/common/cuda_hip/solver/batch_cg_kernels.hpp similarity index 78% rename from common/cuda_hip/solver/batch_cg_kernels.hpp.inc rename to common/cuda_hip/solver/batch_cg_kernels.hpp index 4f4b382f552..e7ec0505844 100644 --- a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc +++ b/common/cuda_hip/solver/batch_cg_kernels.hpp @@ 
-2,6 +2,42 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_SOLVER_BATCH_CG_KERNELS_HPP_ +#define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_CG_KERNELS_HPP_ + + +#include +#include + +#include +#include +#include + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template __device__ __forceinline__ void initialize( @@ -22,7 +58,7 @@ __device__ __forceinline__ void initialize( __syncthreads(); // r = b - A*x - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + batch_single_kernels::advanced_apply( static_cast(-1.0), mat_entry, x_shared_entry, static_cast(1.0), r_shared_entry); __syncthreads(); @@ -33,14 +69,13 @@ __device__ __forceinline__ void initialize( if (threadIdx.x / config::warp_size == 0) { // Compute norms of rhs - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, - rhs_norms_sh); + batch_single_kernels::single_rhs_compute_norm2( + subgroup, num_rows, b_global_entry, rhs_norms_sh); } else if 
(threadIdx.x / config::warp_size == 1) { // rho_old = r' * z - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry, - z_shared_entry, rho_old_shared_entry); + batch_single_kernels::single_rhs_compute_conj_dot( + subgroup, num_rows, r_shared_entry, z_shared_entry, + rho_old_shared_entry); } // p = z @@ -72,9 +107,9 @@ __device__ __forceinline__ void update_x_and_r( ValueType* const x_shared_entry, ValueType* const r_shared_entry) { if (threadIdx.x / config::warp_size == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry, - Ap_shared_entry, alpha_shared_entry); + batch_single_kernels::single_rhs_compute_conj_dot( + subgroup, num_rows, p_shared_entry, Ap_shared_entry, + alpha_shared_entry); } __syncthreads(); @@ -190,8 +225,7 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf, } // Ap = A * p - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - simple_apply(mat_entry, p_sh, Ap_sh); + batch_single_kernels::simple_apply(mat_entry, p_sh, Ap_sh); __syncthreads(); // alpha = rho_old / (p' * Ap) @@ -207,9 +241,8 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf, if (threadIdx.x / config::warp_size == 0) { // rho_new = (r)' * (z) - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh, - rho_new_sh[0]); + batch_single_kernels::single_rhs_compute_conj_dot( + subgroup, num_rows, r_sh, z_sh, rho_new_sh[0]); } __syncthreads(); @@ -228,8 +261,16 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf, logger.log_iteration(batch_id, iter, norms_res_sh[0]); // copy x back to global memory - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_copy(num_rows, x_sh, x_global_entry); + batch_single_kernels::single_rhs_copy(num_rows, x_sh, 
x_global_entry); __syncthreads(); } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 35d567fd911..d3dc8712201 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -24,6 +24,7 @@ #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" +#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" @@ -32,19 +33,9 @@ namespace gko { namespace kernels { namespace cuda { - - -/** - * @brief The batch Bicgstab solver namespace. - * - * @ingroup batch_bicgstab - */ namespace batch_bicgstab { -#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc" - - template int get_num_threads_per_block(std::shared_ptr exec, @@ -56,9 +47,10 @@ int get_num_threads_per_block(std::shared_ptr exec, const int device_max_threads = ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz; cudaFuncAttributes funcattr; - cudaFuncGetAttributes(&funcattr, - apply_kernel); + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); const int num_regs_used = funcattr.numRegs; int max_regs_blk = 0; cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, @@ -80,13 +72,14 @@ int get_max_dynamic_shared_memory(std::shared_ptr exec) cudaDevAttrMaxSharedMemoryPerMultiprocessor, exec->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( - apply_kernel, + batch_single_kernels::apply_kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); cudaFuncAttributes funcattr; - cudaFuncGetAttributes(&funcattr, - apply_kernel); + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); 
return funcattr.maxDynamicSharedSizeBytes; } @@ -116,7 +109,7 @@ public: value_type* const __restrict__ workspace_data, const int& block_size, const size_t& shared_size) const { - apply_kernel + batch_single_kernels::apply_kernel <<get_stream()>>>(sconf, settings_.max_iterations, settings_.residual_tol, logger, prec, mat, diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index f26f2d37313..b8ead675a3c 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -23,6 +23,7 @@ #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" +#include "common/cuda_hip/solver/batch_cg_kernels.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" @@ -31,19 +32,9 @@ namespace gko { namespace kernels { namespace cuda { - - -/** - * @brief The batch Cg solver namespace. - * - * @ingroup batch_cg - */ namespace batch_cg { -#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc" - - template int get_num_threads_per_block(std::shared_ptr exec, @@ -55,9 +46,10 @@ int get_num_threads_per_block(std::shared_ptr exec, const int device_max_threads = (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; cudaFuncAttributes funcattr; - cudaFuncGetAttributes(&funcattr, - apply_kernel); + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); const int num_regs_used = funcattr.numRegs; int max_regs_blk = 0; cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, @@ -79,13 +71,14 @@ int get_max_dynamic_shared_memory(std::shared_ptr exec) cudaDevAttrMaxSharedMemoryPerMultiprocessor, exec->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( - apply_kernel, + batch_single_kernels::apply_kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); cudaFuncAttributes funcattr; - 
cudaFuncGetAttributes(&funcattr, - apply_kernel); + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); return funcattr.maxDynamicSharedSizeBytes; } @@ -115,7 +108,7 @@ public: value_type* const __restrict__ workspace_data, const int& block_size, const size_t& shared_size) const { - apply_kernel + batch_single_kernels::apply_kernel <<get_stream()>>>(sconf, settings_.max_iterations, settings_.residual_tol, logger, prec, mat, diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index a5de10953bc..d44bc4a0eb6 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -25,6 +25,7 @@ #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" +#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" @@ -33,19 +34,9 @@ namespace gko { namespace kernels { namespace hip { - - -/** - * @brief The batch Bicgstab solver namespace. 
- * - * @ingroup batch_bicgstab - */ namespace batch_bicgstab { -#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc" - - template int get_num_threads_per_block(std::shared_ptr exec, const int num_rows) @@ -96,7 +87,7 @@ class kernel_caller { value_type* const __restrict__ workspace_data, const int& block_size, const size_t& shared_size) const { - apply_kernel + batch_single_kernels::apply_kernel <<get_stream()>>>(sconf, settings_.max_iterations, settings_.residual_tol, logger, prec, mat, diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 23bb939ead8..c9a1e81be81 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -25,6 +25,7 @@ #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" +#include "common/cuda_hip/solver/batch_cg_kernels.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" @@ -33,19 +34,9 @@ namespace gko { namespace kernels { namespace hip { - - -/** - * @brief The batch Cg solver namespace. 
- * - * @ingroup batch_cg - */ namespace batch_cg { -#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc" - - template int get_num_threads_per_block(std::shared_ptr exec, const int num_rows) @@ -96,7 +87,7 @@ class kernel_caller { value_type* const __restrict__ workspace_data, const int& block_size, const size_t& shared_size) const { - apply_kernel + batch_single_kernels::apply_kernel <<get_stream()>>>(sconf, settings_.max_iterations, settings_.residual_tol, logger, prec, mat, From 9dafdbfee3fa992fe303461df175746078606299 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sat, 24 Aug 2024 00:27:51 +0200 Subject: [PATCH 162/448] [ref, omp] move kernels to headers --- omp/solver/batch_bicgstab_kernels.cpp | 17 +--- omp/solver/batch_cg_kernels.cpp | 14 ++- reference/matrix/batch_dense_kernels.hpp | 2 - reference/matrix/batch_ell_kernels.hpp | 2 +- reference/solver/batch_bicgstab_kernels.cpp | 8 +- ...els.hpp.inc => batch_bicgstab_kernels.hpp} | 88 ++++++++++++------- reference/solver/batch_cg_kernels.cpp | 11 ++- ...g_kernels.hpp.inc => batch_cg_kernels.hpp} | 66 ++++++++++---- 8 files changed, 123 insertions(+), 85 deletions(-) rename reference/solver/{batch_bicgstab_kernels.hpp.inc => batch_bicgstab_kernels.hpp} (84%) rename reference/solver/{batch_cg_kernels.hpp.inc => batch_cg_kernels.hpp} (80%) diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp index 661cdbcd2ec..ed880507116 100644 --- a/omp/solver/batch_bicgstab_kernels.cpp +++ b/omp/solver/batch_bicgstab_kernels.cpp @@ -13,22 +13,13 @@ #include "reference/matrix/batch_csr_kernels.hpp" #include "reference/matrix/batch_dense_kernels.hpp" #include "reference/matrix/batch_ell_kernels.hpp" +#include "reference/solver/batch_bicgstab_kernels.hpp" namespace gko { namespace kernels { namespace omp { namespace batch_bicgstab { -namespace { - - -constexpr int max_num_rhs = 1; - - -#include "reference/solver/batch_bicgstab_kernels.hpp.inc" - - -} // unnamed namespace template @@ 
-54,7 +45,7 @@ class kernel_caller { const size_type num_batch_items = mat.num_batch_items; const auto num_rows = mat.num_rows; const auto num_rhs = b.num_rhs; - if (num_rhs > max_num_rhs) { + if (num_rhs > 1) { GKO_NOT_IMPLEMENTED; } @@ -73,8 +64,8 @@ class kernel_caller { exec_, local_size_bytes, local_space.get_data() + omp_get_thread_num() * local_size_bytes); - batch_entry_bicgstab_impl( + batch_single_kernels::batch_entry_bicgstab_impl< + StopType, PrecondType, LogType, BatchMatrixType, ValueType>( settings_, logger, precond, mat, b, x, batch_id, thread_local_space.get_data()); } diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp index 3a6e31256c2..89d4441db64 100644 --- a/omp/solver/batch_cg_kernels.cpp +++ b/omp/solver/batch_cg_kernels.cpp @@ -13,6 +13,7 @@ #include "reference/matrix/batch_csr_kernels.hpp" #include "reference/matrix/batch_dense_kernels.hpp" #include "reference/matrix/batch_ell_kernels.hpp" +#include "reference/solver/batch_cg_kernels.hpp" namespace gko { @@ -25,9 +26,6 @@ namespace { constexpr int max_num_rhs = 1; -#include "reference/solver/batch_cg_kernels.hpp.inc" - - } // unnamed namespace @@ -54,7 +52,7 @@ class kernel_caller { const size_type num_batch_items = mat.num_batch_items; const auto num_rows = mat.num_rows; const auto num_rhs = b.num_rhs; - if (num_rhs > max_num_rhs) { + if (num_rhs > 1) { GKO_NOT_IMPLEMENTED; } @@ -72,10 +70,10 @@ class kernel_caller { exec_, local_size_bytes, local_space.get_data() + omp_get_thread_num() * local_size_bytes); - batch_entry_cg_impl(settings_, logger, precond, mat, b, - x, batch_id, - thread_local_space.get_data()); + batch_single_kernels::batch_entry_cg_impl< + StopType, PrecondType, LogType, BatchMatrixType, ValueType>( + settings_, logger, precond, mat, b, x, batch_id, + thread_local_space.get_data()); } } diff --git a/reference/matrix/batch_dense_kernels.hpp b/reference/matrix/batch_dense_kernels.hpp index 7fd6a8cdbb5..bc4e7c497cd 100644 --- 
a/reference/matrix/batch_dense_kernels.hpp +++ b/reference/matrix/batch_dense_kernels.hpp @@ -6,8 +6,6 @@ #define GKO_REFERENCE_MATRIX_BATCH_DENSE_KERNELS_HPP_ -#include - #include #include diff --git a/reference/matrix/batch_ell_kernels.hpp b/reference/matrix/batch_ell_kernels.hpp index cfdc2040d8f..d6892c67f32 100644 --- a/reference/matrix/batch_ell_kernels.hpp +++ b/reference/matrix/batch_ell_kernels.hpp @@ -9,7 +9,7 @@ #include #include -#include +#include #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp index 33e1e9392d9..20883e24434 100644 --- a/reference/solver/batch_bicgstab_kernels.cpp +++ b/reference/solver/batch_bicgstab_kernels.cpp @@ -9,6 +9,7 @@ #include "reference/matrix/batch_csr_kernels.hpp" #include "reference/matrix/batch_dense_kernels.hpp" #include "reference/matrix/batch_ell_kernels.hpp" +#include "reference/solver/batch_bicgstab_kernels.hpp" namespace gko { @@ -21,9 +22,6 @@ namespace { constexpr int max_num_rhs = 1; -#include "reference/solver/batch_bicgstab_kernels.hpp.inc" - - } // unnamed namespace @@ -62,8 +60,8 @@ class kernel_caller { array local_space(exec_, local_size_bytes); for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) { - batch_entry_bicgstab_impl( + batch_single_kernels::batch_entry_bicgstab_impl< + StopType, PrecType, LogType, BatchMatrixType, ValueType>( settings_, logger, prec, mat, b, x, batch_id, local_space.get_data()); } diff --git a/reference/solver/batch_bicgstab_kernels.hpp.inc b/reference/solver/batch_bicgstab_kernels.hpp similarity index 84% rename from reference/solver/batch_bicgstab_kernels.hpp.inc rename to reference/solver/batch_bicgstab_kernels.hpp index 786e98eb5d1..f91e06d2e44 100644 --- a/reference/solver/batch_bicgstab_kernels.hpp.inc +++ b/reference/solver/batch_bicgstab_kernels.hpp @@ -2,6 +2,29 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef 
GKO_REFERENCE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ +#define GKO_REFERENCE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ + + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "reference/base/batch_multi_vector_kernels.hpp" +#include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + +constexpr int max_num_rhs = 1; + + template inline void initialize( const BatchMatrixType_entry& A_entry, @@ -25,20 +48,18 @@ inline void initialize( alpha_entry.values[0] = one(); // Compute norms of rhs - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_norm2_kernel(b_entry, rhs_norms_entry); + batch_single_kernels::compute_norm2_kernel(b_entry, + rhs_norms_entry); // r = b - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( - b_entry, r_entry); + batch_single_kernels::copy_kernel(b_entry, r_entry); // r = b - A*x - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( - static_cast(-1.0), A_entry, gko::batch::to_const(x_entry), - static_cast(1.0), r_entry); - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_norm2_kernel(gko::batch::to_const(r_entry), - res_norms_entry); + batch_single_kernels::advanced_apply(static_cast(-1.0), A_entry, + gko::batch::to_const(x_entry), + static_cast(1.0), r_entry); + batch_single_kernels::compute_norm2_kernel( + gko::batch::to_const(r_entry), res_norms_entry); for (int r = 0; r < p_entry.num_rows; r++) { r_hat_entry.values[r * r_hat_entry.stride] = @@ -78,9 +99,8 @@ inline void compute_alpha( const gko::batch::multi_vector::batch_item& v_entry, const gko::batch::multi_vector::batch_item& alpha_entry) { - 
gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_dot_product_kernel(r_hat_entry, v_entry, - alpha_entry); + batch_single_kernels::compute_dot_product_kernel( + r_hat_entry, v_entry, alpha_entry); alpha_entry.values[0] = rho_new_entry.values[0] / alpha_entry.values[0]; } @@ -107,10 +127,10 @@ inline void compute_omega( const gko::batch::multi_vector::batch_item& temp_entry, const gko::batch::multi_vector::batch_item& omega_entry) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_dot_product_kernel(t_entry, s_entry, omega_entry); - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_dot_product_kernel(t_entry, t_entry, temp_entry); + batch_single_kernels::compute_dot_product_kernel( + t_entry, s_entry, omega_entry); + batch_single_kernels::compute_dot_product_kernel( + t_entry, t_entry, temp_entry); omega_entry.values[0] /= temp_entry.values[0]; } @@ -253,10 +273,9 @@ inline void batch_entry_bicgstab_impl( } // rho_new = < r_hat , r > = (r_hat)' * (r) - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_dot_product_kernel( - gko::batch::to_const(r_hat_entry), - gko::batch::to_const(r_entry), rho_new_entry); + batch_single_kernels::compute_dot_product_kernel( + gko::batch::to_const(r_hat_entry), gko::batch::to_const(r_entry), + rho_new_entry); // beta = (rho_new / rho_old)*(alpha / omega) // p = r + beta*(p - omega * v) @@ -271,7 +290,7 @@ inline void batch_entry_bicgstab_impl( prec.apply(gko::batch::to_const(p_entry), p_hat_entry); // v = A * p_hat - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + batch_single_kernels::simple_apply( A_entry, gko::batch::to_const(p_hat_entry), v_entry); // alpha = rho_new / < r_hat , v> @@ -285,9 +304,8 @@ inline void batch_entry_bicgstab_impl( gko::batch::to_const(v_entry), s_entry); // an estimate of residual norms - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_norm2_kernel(gko::batch::to_const(s_entry), - 
res_norms_entry); + batch_single_kernels::compute_norm2_kernel( + gko::batch::to_const(s_entry), res_norms_entry); if (stop.check_converged(res_norms_entry.values)) { // update x for the systems @@ -303,7 +321,7 @@ inline void batch_entry_bicgstab_impl( prec.apply(gko::batch::to_const(s_entry), s_hat_entry); // t = A * s_hat - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + batch_single_kernels::simple_apply( A_entry, gko::batch::to_const(s_hat_entry), t_entry); // omega = / compute_omega(gko::batch::to_const(t_entry), @@ -319,14 +337,22 @@ inline void batch_entry_bicgstab_impl( gko::batch::to_const(s_entry), gko::batch::to_const(t_entry), x_entry, r_entry); - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_norm2_kernel(gko::batch::to_const(r_entry), - res_norms_entry); + batch_single_kernels::compute_norm2_kernel( + gko::batch::to_const(r_entry), res_norms_entry); // rho_old = rho_new - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( - gko::batch::to_const(rho_new_entry), rho_old_entry); + batch_single_kernels::copy_kernel(gko::batch::to_const(rho_new_entry), + rho_old_entry); } logger.log_iteration(batch_item_id, iter, res_norms_entry.values[0]); } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp index 7c69157d4a7..f2155f98719 100644 --- a/reference/solver/batch_cg_kernels.cpp +++ b/reference/solver/batch_cg_kernels.cpp @@ -9,6 +9,7 @@ #include "reference/matrix/batch_csr_kernels.hpp" #include "reference/matrix/batch_dense_kernels.hpp" #include "reference/matrix/batch_ell_kernels.hpp" +#include "reference/solver/batch_cg_kernels.hpp" namespace gko { @@ -21,9 +22,6 @@ namespace { constexpr int max_num_rhs = 1; -#include "reference/solver/batch_cg_kernels.hpp.inc" - - } // unnamed namespace @@ -62,9 +60,10 @@ class 
kernel_caller { array local_space(exec_, local_size_bytes); for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) { - batch_entry_cg_impl(settings_, logger, prec, mat, b, x, - batch_id, local_space.get_data()); + batch_single_kernels::batch_entry_cg_impl< + StopType, PrecType, LogType, BatchMatrixType, ValueType>( + settings_, logger, prec, mat, b, x, batch_id, + local_space.get_data()); } } diff --git a/reference/solver/batch_cg_kernels.hpp.inc b/reference/solver/batch_cg_kernels.hpp similarity index 80% rename from reference/solver/batch_cg_kernels.hpp.inc rename to reference/solver/batch_cg_kernels.hpp index 991db5c061c..d4a35e3d01a 100644 --- a/reference/solver/batch_cg_kernels.hpp.inc +++ b/reference/solver/batch_cg_kernels.hpp @@ -2,6 +2,29 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_REFERENCE_SOLVER_BATCH_CG_KERNELS_HPP_ +#define GKO_REFERENCE_SOLVER_BATCH_CG_KERNELS_HPP_ + + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "reference/base/batch_multi_vector_kernels.hpp" +#include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + +constexpr int max_num_rhs = 1; + + template inline void initialize( const BatchMatrixType_entry& A_entry, @@ -26,17 +49,16 @@ inline void initialize( } // Compute norms of rhs - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_norm2_kernel(b_entry, rhs_norms_entry); + batch_single_kernels::compute_norm2_kernel(b_entry, + rhs_norms_entry); // r = b - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( - b_entry, r_entry); + batch_single_kernels::copy_kernel(b_entry, r_entry); // r = b - A*x -
gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( - static_cast(-1.0), A_entry, gko::batch::to_const(x_entry), - static_cast(1.0), r_entry); + batch_single_kernels::advanced_apply(static_cast(-1.0), A_entry, + gko::batch::to_const(x_entry), + static_cast(1.0), r_entry); } @@ -48,8 +70,7 @@ inline void update_p( const gko::batch::multi_vector::batch_item& p_entry) { if (rho_old_entry.values[0] == zero()) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( - z_entry, p_entry); + batch_single_kernels::copy_kernel(z_entry, p_entry); return; } const ValueType beta = rho_new_entry.values[0] / rho_old_entry.values[0]; @@ -70,9 +91,8 @@ inline void update_x_and_r( const gko::batch::multi_vector::batch_item& x_entry, const gko::batch::multi_vector::batch_item& r_entry) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_conj_dot_product_kernel(p_entry, Ap_entry, - alpha_entry); + batch_single_kernels::compute_conj_dot_product_kernel( + p_entry, Ap_entry, alpha_entry); const ValueType temp = rho_old_entry.values[0] / alpha_entry.values[0]; for (int row = 0; row < r_entry.num_rows; row++) { @@ -159,10 +179,9 @@ inline void batch_entry_cg_impl( prec.apply(gko::batch::to_const(r_entry), z_entry); // rho_new = < r , z > = (r)' * (z) - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - compute_conj_dot_product_kernel( - gko::batch::to_const(r_entry), gko::batch::to_const(z_entry), - rho_new_entry); + batch_single_kernels::compute_conj_dot_product_kernel( + gko::batch::to_const(r_entry), gko::batch::to_const(z_entry), + rho_new_entry); ++iter; // use implicit residual norms res_norms_entry.values[0] = sqrt(abs(rho_new_entry.values[0])); @@ -181,7 +200,7 @@ inline void batch_entry_cg_impl( gko::batch::to_const(z_entry), p_entry); // Ap = A * p - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( + batch_single_kernels::simple_apply( A_entry, gko::batch::to_const(p_entry), Ap_entry); 
// temp= rho_old / (p' * Ap) @@ -192,9 +211,18 @@ inline void batch_entry_cg_impl( gko::batch::to_const(Ap_entry), alpha_entry, x_entry, r_entry); // rho_old = rho_new - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( - gko::batch::to_const(rho_new_entry), rho_old_entry); + batch_single_kernels::copy_kernel(gko::batch::to_const(rho_new_entry), + rho_old_entry); } logger.log_iteration(batch_item_id, iter, res_norms_entry.values[0]); } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif From fce448ed87a8fb87068265244de2a94f1d2443fc Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sat, 24 Aug 2024 00:36:46 +0200 Subject: [PATCH 163/448] [dpcpp] move to headers --- dpcpp/solver/batch_bicgstab_kernels.dp.cpp | 7 +- ...els.hpp.inc => batch_bicgstab_kernels.hpp} | 41 +++++++++++ dpcpp/solver/batch_cg_kernels.dp.cpp | 7 +- ...g_kernels.hpp.inc => batch_cg_kernels.hpp} | 72 ++++++++++++++----- 4 files changed, 101 insertions(+), 26 deletions(-) rename dpcpp/solver/{batch_bicgstab_kernels.hpp.inc => batch_bicgstab_kernels.hpp} (93%) rename dpcpp/solver/{batch_cg_kernels.hpp.inc => batch_cg_kernels.hpp} (82%) diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp index 291ee1d8a8b..7036b770f1b 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp +++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp @@ -27,6 +27,7 @@ #include "dpcpp/matrix/batch_dense_kernels.hpp" #include "dpcpp/matrix/batch_ell_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" +#include "dpcpp/solver/batch_bicgstab_kernels.hpp" namespace gko { @@ -35,9 +36,6 @@ namespace dpcpp { namespace batch_bicgstab { -#include "dpcpp/solver/batch_bicgstab_kernels.hpp.inc" - - template using settings = gko::kernels::batch_bicgstab::settings; @@ -95,7 +93,8 @@ class kernel_caller { ValueType* const x_global_entry = gko::batch::multi_vector::batch_item_ptr( x_values, 1, 
num_rows, batch_id); - apply_kernel( + batch_single_kernels::apply_kernel( sconf, max_iters, res_tol, logger, prec, mat_global_entry, b_global_entry, x_global_entry, num_rows, mat.get_single_item_num_nnz(), diff --git a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc b/dpcpp/solver/batch_bicgstab_kernels.hpp similarity index 93% rename from dpcpp/solver/batch_bicgstab_kernels.hpp.inc rename to dpcpp/solver/batch_bicgstab_kernels.hpp index de1956c8c6c..a6db9e7470a 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc +++ b/dpcpp/solver/batch_bicgstab_kernels.hpp @@ -2,6 +2,38 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_DPCPP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ +#define GKO_DPCPP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ + + +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "dpcpp/base/batch_multi_vector_kernels.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_csr_kernels.hpp" +#include "dpcpp/matrix/batch_dense_kernels.hpp" +#include "dpcpp/matrix/batch_ell_kernels.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template __dpct_inline__ void initialize( const int num_rows, const BatchMatrixType_entry& mat_global_entry, @@ -393,3 +425,12 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, num_rows, x_sh, x_global_entry, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace 
gko + + +#endif diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp index 05b3f7b803c..9d3aa14ab2c 100644 --- a/dpcpp/solver/batch_cg_kernels.dp.cpp +++ b/dpcpp/solver/batch_cg_kernels.dp.cpp @@ -27,6 +27,7 @@ #include "dpcpp/matrix/batch_dense_kernels.hpp" #include "dpcpp/matrix/batch_ell_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" +#include "dpcpp/solver/batch_cg_kernels.hpp" namespace gko { @@ -35,9 +36,6 @@ namespace dpcpp { namespace batch_cg { -#include "dpcpp/solver/batch_cg_kernels.hpp.inc" - - template using settings = gko::kernels::batch_cg::settings; @@ -95,7 +93,8 @@ class kernel_caller { ValueType* const x_global_entry = gko::batch::multi_vector::batch_item_ptr( x_values, 1, num_rows, batch_id); - apply_kernel( + batch_single_kernels::apply_kernel( sconf, max_iters, res_tol, logger, prec, mat_global_entry, b_global_entry, x_global_entry, num_rows, mat.get_single_item_num_nnz(), diff --git a/dpcpp/solver/batch_cg_kernels.hpp.inc b/dpcpp/solver/batch_cg_kernels.hpp similarity index 82% rename from dpcpp/solver/batch_cg_kernels.hpp.inc rename to dpcpp/solver/batch_cg_kernels.hpp index b233b7df680..67df0a17236 100644 --- a/dpcpp/solver/batch_cg_kernels.hpp.inc +++ b/dpcpp/solver/batch_cg_kernels.hpp @@ -2,6 +2,38 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_DPCPP_SOLVER_BATCH_CG_KERNELS_HPP_ +#define GKO_DPCPP_SOLVER_BATCH_CG_KERNELS_HPP_ + + +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "dpcpp/base/batch_multi_vector_kernels.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include 
"dpcpp/matrix/batch_csr_kernels.hpp" +#include "dpcpp/matrix/batch_dense_kernels.hpp" +#include "dpcpp/matrix/batch_ell_kernels.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template __dpct_inline__ void initialize( const int num_rows, const BatchMatrixType& mat_global_entry, @@ -27,7 +59,7 @@ __dpct_inline__ void initialize( item_ct1.barrier(sycl::access::fence_space::global_and_local); // r = b - A*x - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( + batch_single_kernels::advanced_apply( static_cast(-1.0), mat_global_entry, x_shared_entry, static_cast(1.0), r_shared_entry, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -40,13 +72,11 @@ __dpct_inline__ void initialize( // Compute norms of rhs // and rho_old = r' * z if (sg_id == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norms, - item_ct1); + batch_single_kernels::single_rhs_compute_norm2_sg( + num_rows, b_global_entry, rhs_norms, item_ct1); } else if (sg_id == 1) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot_sg(num_rows, r_shared_entry, - z_shared_entry, rho_old, item_ct1); + batch_single_kernels::single_rhs_compute_conj_dot_sg( + num_rows, r_shared_entry, z_shared_entry, rho_old, item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -82,10 +112,9 @@ __dpct_inline__ void update_x_and_r( auto sg = item_ct1.get_sub_group(); const auto tid = item_ct1.get_local_linear_id(); if (sg.get_group_id() == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot_sg(num_rows, p_shared_entry, - Ap_shared_entry, alpha_shared_entry, - item_ct1); + batch_single_kernels::single_rhs_compute_conj_dot_sg( + num_rows, p_shared_entry, Ap_shared_entry, alpha_shared_entry, + 
item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); if (tid == 0) { @@ -207,8 +236,8 @@ __dpct_inline__ void apply_kernel( break; } // Ap = A * p - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( - mat_global_entry, p_sh, Ap_sh, item_ct1); + batch_single_kernels::simple_apply(mat_global_entry, p_sh, Ap_sh, + item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); // alpha = rho_old / (p' * Ap) @@ -225,9 +254,8 @@ __dpct_inline__ void apply_kernel( // rho_new = (r)' * (z) if (sg_id == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot_sg(num_rows, r_sh, z_sh, - rho_new_sh[0], item_ct1); + batch_single_kernels::single_rhs_compute_conj_dot_sg( + num_rows, r_sh, z_sh, rho_new_sh[0], item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -244,7 +272,15 @@ __dpct_inline__ void apply_kernel( logger.log_iteration(batch_id, iter, norms_res_sh[0]); // copy x back to global memory - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( - num_rows, x_sh, x_global_entry, item_ct1); + batch_single_kernels::copy_kernel(num_rows, x_sh, x_global_entry, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif From d018241d4ec868da24c8d4e66baa5f040b2055a1 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sat, 24 Aug 2024 12:44:59 +0200 Subject: [PATCH 164/448] [cuda, hip] reorg batch preconds --- ..._jacobi.hpp.inc => batch_block_jacobi.hpp} | 47 +++++++++++ .../preconditioner/batch_identity.hpp | 78 +++++++++++++++++++ .../preconditioner/batch_identity.hpp.inc | 33 -------- ...rnels.hpp.inc => batch_jacobi_kernels.hpp} | 45 +++++++++++ .../preconditioner/batch_preconditioners.hpp | 17 ++++ ...jacobi.hpp.inc => batch_scalar_jacobi.hpp} | 46 +++++++++++ core/solver/batch_dispatch.hpp | 
6 +- cuda/preconditioner/batch_jacobi_kernels.cu | 20 ++--- cuda/preconditioner/batch_preconditioners.cuh | 32 -------- .../batch_jacobi_kernels.hip.cpp | 18 +++-- .../batch_preconditioners.hip.hpp | 32 -------- 11 files changed, 255 insertions(+), 119 deletions(-) rename common/cuda_hip/preconditioner/{batch_block_jacobi.hpp.inc => batch_block_jacobi.hpp} (81%) create mode 100644 common/cuda_hip/preconditioner/batch_identity.hpp delete mode 100644 common/cuda_hip/preconditioner/batch_identity.hpp.inc rename common/cuda_hip/preconditioner/{batch_jacobi_kernels.hpp.inc => batch_jacobi_kernels.hpp} (87%) create mode 100644 common/cuda_hip/preconditioner/batch_preconditioners.hpp rename common/cuda_hip/preconditioner/{batch_scalar_jacobi.hpp.inc => batch_scalar_jacobi.hpp} (77%) delete mode 100644 cuda/preconditioner/batch_preconditioners.cuh delete mode 100644 hip/preconditioner/batch_preconditioners.hip.hpp diff --git a/common/cuda_hip/preconditioner/batch_block_jacobi.hpp.inc b/common/cuda_hip/preconditioner/batch_block_jacobi.hpp similarity index 81% rename from common/cuda_hip/preconditioner/batch_block_jacobi.hpp.inc rename to common/cuda_hip/preconditioner/batch_block_jacobi.hpp index 124f1ee93a1..5aff975e960 100644 --- a/common/cuda_hip/preconditioner/batch_block_jacobi.hpp.inc +++ b/common/cuda_hip/preconditioner/batch_block_jacobi.hpp @@ -2,6 +2,44 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_BLOCK_JACOBI_HPP_ +#define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_BLOCK_JACOBI_HPP_ + + +#include +#include + +#include +#include +#include +#include + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include 
"common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" +#include "core/preconditioner/batch_jacobi_helpers.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_preconditioner { + + /** * BlockJacobi preconditioner for batch solvers. */ @@ -173,3 +211,12 @@ class BlockJacobi final { const int* __restrict__ const block_ptrs_arr_; const int* __restrict__ const row_block_map_; }; + + +} // namespace batch_preconditioner +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif diff --git a/common/cuda_hip/preconditioner/batch_identity.hpp b/common/cuda_hip/preconditioner/batch_identity.hpp new file mode 100644 index 00000000000..634d3212f36 --- /dev/null +++ b/common/cuda_hip/preconditioner/batch_identity.hpp @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_IDENTITY_HPP_ +#define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_IDENTITY_HPP_ + + +#include +#include + +#include +#include +#include + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" 
+#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_preconditioner { + + +/** + * @see reference/preconditioner/batch_identity.hpp + */ +template +class Identity final { +public: + using value_type = ValueType; + + static constexpr int work_size = 0; + + __host__ __device__ static constexpr int dynamic_work_size(int, int) + { + return 0; + } + + template + __device__ __forceinline__ void generate(size_type, const batch_item_type&, + ValueType*) + {} + + __device__ __forceinline__ void apply(const int num_rows, + const ValueType* const r, + ValueType* const z) const + { + for (int li = threadIdx.x; li < num_rows; li += blockDim.x) { + z[li] = r[li]; + } + } +}; + + +} // namespace batch_preconditioner +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif diff --git a/common/cuda_hip/preconditioner/batch_identity.hpp.inc b/common/cuda_hip/preconditioner/batch_identity.hpp.inc deleted file mode 100644 index b85a8b1d7da..00000000000 --- a/common/cuda_hip/preconditioner/batch_identity.hpp.inc +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -/** - * @see reference/preconditioner/batch_identity.hpp - */ -template -class Identity final { -public: - using value_type = ValueType; - - static constexpr int work_size = 0; - - __host__ __device__ static constexpr int dynamic_work_size(int, int) - { - return 0; - } - - 
template - __device__ __forceinline__ void generate(size_type, const batch_item_type&, - ValueType*) - {} - - __device__ __forceinline__ void apply(const int num_rows, - const ValueType* const r, - ValueType* const z) const - { - for (int li = threadIdx.x; li < num_rows; li += blockDim.x) { - z[li] = r[li]; - } - } -}; diff --git a/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp.inc b/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp similarity index 87% rename from common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp.inc rename to common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp index a26a2077c2d..ac9143fefb9 100644 --- a/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp.inc +++ b/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp @@ -2,6 +2,42 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_ +#define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_ + + +#include +#include + +#include +#include +#include + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace 
GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + __global__ void compute_block_storage_kernel( const gko::size_type num_blocks, const int* const __restrict__ block_pointers, @@ -243,3 +279,12 @@ __launch_bounds__(default_block_size) void compute_block_jacobi_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif diff --git a/common/cuda_hip/preconditioner/batch_preconditioners.hpp b/common/cuda_hip/preconditioner/batch_preconditioners.hpp new file mode 100644 index 00000000000..fc1d3fd2c9e --- /dev/null +++ b/common/cuda_hip/preconditioner/batch_preconditioners.hpp @@ -0,0 +1,17 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HPP_ +#define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HPP_ + + +#include "common/cuda_hip/preconditioner/batch_block_jacobi.hpp" +#include "common/cuda_hip/preconditioner/batch_identity.hpp" +#include "common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/preconditioner/batch_jacobi_helpers.hpp" + + +#endif // GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HPP_ diff --git a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp similarity index 77% rename from common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc rename to common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp index 751c2696e15..695d31235a8 100644 --- a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc +++ b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp @@ -2,6 +2,43 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_SCALAR_JACOBI_HPP_ +#define 
GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_SCALAR_JACOBI_HPP_ + + +#include +#include + +#include +#include +#include +#include + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" +#include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" +#include "common/cuda_hip/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_preconditioner { + + /** * (Scalar) Jacobi preconditioner for batch solvers. 
*/ @@ -132,3 +169,12 @@ class ScalarJacobi final { private: value_type* __restrict__ work_; }; + + +} // namespace batch_preconditioner +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index 599c708b334..178f6b1beae 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -26,9 +26,8 @@ #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/preconditioner/batch_preconditioners.hpp" #include "cuda/log/batch_logger.cuh" -#include "cuda/preconditioner/batch_preconditioners.cuh" #include "cuda/stop/batch_criteria.cuh" @@ -54,9 +53,8 @@ using DeviceValueType = typename gko::kernels::cuda::cuda_type; #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/preconditioner/batch_preconditioners.hpp" #include "hip/log/batch_logger.hip.hpp" -#include "hip/preconditioner/batch_preconditioners.hip.hpp" #include "hip/stop/batch_criteria.hip.hpp" diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu index edf052cb649..8768937dc6d 100644 --- a/cuda/preconditioner/batch_jacobi_kernels.cu +++ b/cuda/preconditioner/batch_jacobi_kernels.cu @@ -12,6 +12,7 @@ #include "common/cuda_hip/components/intrinsics.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" +#include "common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp" #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -29,8 +30,6 @@ namespace gko { namespace kernels { namespace cuda { namespace batch_jacobi { - - namespace { @@ -39,8 +38,6 @@ constexpr int 
default_block_size = 128; using batch_jacobi_cuda_compiled_max_block_sizes = gko::kernels::cuda::jacobi::compiled_kernels; -#include "common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp.inc" - } // namespace @@ -54,8 +51,9 @@ void compute_cumulative_block_storage( dim3 block(default_block_size); dim3 grid(ceildiv(num_blocks, default_block_size)); - compute_block_storage_kernel<<get_stream()>>>( - num_blocks, block_pointers, blocks_cumulative_offsets); + batch_single_kernels:: + compute_block_storage_kernel<<get_stream()>>>( + num_blocks, block_pointers, blocks_cumulative_offsets); components::prefix_sum_nonnegative(exec, blocks_cumulative_offsets, num_blocks + 1); @@ -73,8 +71,9 @@ void find_row_block_map(std::shared_ptr exec, { dim3 block(default_block_size); dim3 grid(ceildiv(num_blocks, default_block_size)); - find_row_block_map_kernel<<get_stream()>>>( - num_blocks, block_pointers, map_block_to_row); + batch_single_kernels:: + find_row_block_map_kernel<<get_stream()>>>( + num_blocks, block_pointers, map_block_to_row); } GKO_INSTANTIATE_FOR_INT32_TYPE( @@ -93,7 +92,8 @@ void extract_common_blocks_pattern( dim3 block(default_block_size); dim3 grid(ceildiv(nrows * config::warp_size, default_block_size)); - extract_common_block_pattern_kernel<<get_stream()>>>( + batch_single_kernels::extract_common_block_pattern_kernel<<< + grid, block, 0, exec->get_stream()>>>( static_cast(nrows), first_sys_csr->get_const_row_ptrs(), first_sys_csr->get_const_col_idxs(), num_blocks, cumulative_block_storage, block_pointers, map_block_to_row, @@ -125,7 +125,7 @@ void compute_block_jacobi_helper( dim3 block(default_block_size); dim3 grid(ceildiv(num_blocks * nbatch * subwarp_size, default_block_size)); - compute_block_jacobi_kernel + batch_single_kernels::compute_block_jacobi_kernel <<get_stream()>>>( nbatch, static_cast(nnz), as_cuda_type(sys_csr->get_const_values()), num_blocks, diff --git a/cuda/preconditioner/batch_preconditioners.cuh 
b/cuda/preconditioner/batch_preconditioners.cuh deleted file mode 100644 index 01001c036b2..00000000000 --- a/cuda/preconditioner/batch_preconditioners.cuh +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_PRECONDITIONER_BATCH_PRECONDITIONERS_CUH_ -#define GKO_CUDA_PRECONDITIONER_BATCH_PRECONDITIONERS_CUH_ - - -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "core/matrix/batch_struct.hpp" -#include "core/preconditioner/batch_jacobi_helpers.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace batch_preconditioner { - - -#include "common/cuda_hip/preconditioner/batch_block_jacobi.hpp.inc" -#include "common/cuda_hip/preconditioner/batch_identity.hpp.inc" -#include "common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc" - - -} // namespace batch_preconditioner -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_PRECONDITIONER_BATCH_PRECONDITIONERS_CUH_ diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp index 38a81972e66..2380bc6a0bd 100644 --- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp +++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp @@ -15,6 +15,7 @@ #include "common/cuda_hip/components/uninitialized_array.hpp" #include "common/cuda_hip/components/warp_blas.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" +#include "common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp" #include "core/base/batch_struct.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -40,8 +41,6 @@ constexpr int default_block_size = 128; using batch_jacobi_hip_compiled_max_block_sizes = gko::kernels::hip::jacobi::compiled_kernels; -#include "common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp.inc" - } // namespace @@ -54,8 +53,9 @@ void 
compute_cumulative_block_storage( dim3 block(default_block_size); dim3 grid(ceildiv(num_blocks, default_block_size)); - compute_block_storage_kernel<<get_stream()>>>( - num_blocks, block_pointers, blocks_cumulative_offsets); + batch_single_kernels:: + compute_block_storage_kernel<<get_stream()>>>( + num_blocks, block_pointers, blocks_cumulative_offsets); components::prefix_sum_nonnegative(exec, blocks_cumulative_offsets, num_blocks + 1); @@ -73,8 +73,9 @@ void find_row_block_map(std::shared_ptr exec, { dim3 block(default_block_size); dim3 grid(ceildiv(num_blocks, default_block_size)); - find_row_block_map_kernel<<get_stream()>>>( - num_blocks, block_pointers, map_block_to_row); + batch_single_kernels:: + find_row_block_map_kernel<<get_stream()>>>( + num_blocks, block_pointers, map_block_to_row); } GKO_INSTANTIATE_FOR_INT32_TYPE( @@ -93,7 +94,8 @@ void extract_common_blocks_pattern( dim3 block(default_block_size); dim3 grid(ceildiv(nrows * config::warp_size, default_block_size)); - extract_common_block_pattern_kernel<<get_stream()>>>( + batch_single_kernels::extract_common_block_pattern_kernel<<< + grid, block, 0, exec->get_stream()>>>( static_cast(nrows), first_sys_csr->get_const_row_ptrs(), first_sys_csr->get_const_col_idxs(), num_blocks, cumulative_block_storage, block_pointers, map_block_to_row, @@ -126,7 +128,7 @@ void compute_block_jacobi_helper( dim3 block(default_block_size); dim3 grid(ceildiv(num_blocks * nbatch * subwarp_size, default_block_size)); - compute_block_jacobi_kernel + batch_single_kernels::compute_block_jacobi_kernel <<get_stream()>>>( nbatch, static_cast(nnz), as_hip_type(sys_csr->get_const_values()), num_blocks, diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp deleted file mode 100644 index f62000ff46f..00000000000 --- a/hip/preconditioner/batch_preconditioners.hip.hpp +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// 
SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HIP_HPP_ -#define GKO_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HIP_HPP_ - - -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "core/matrix/batch_struct.hpp" -#include "core/preconditioner/batch_jacobi_helpers.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace batch_preconditioner { - - -#include "common/cuda_hip/preconditioner/batch_block_jacobi.hpp.inc" -#include "common/cuda_hip/preconditioner/batch_identity.hpp.inc" -#include "common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc" - - -} // namespace batch_preconditioner -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HIP_HPP_ From f6cc8b1876f4ac83740abd94b24bae83c3c68997 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sat, 24 Aug 2024 12:45:20 +0200 Subject: [PATCH 165/448] [ref, omp] reorg batch preconds --- omp/preconditioner/batch_jacobi_kernels.cpp | 22 +++++--------- .../preconditioner/batch_jacobi_kernels.cpp | 22 +++++--------- ...rnels.hpp.inc => batch_jacobi_kernels.hpp} | 30 +++++++++++++++++++ 3 files changed, 44 insertions(+), 30 deletions(-) rename reference/preconditioner/{batch_jacobi_kernels.hpp.inc => batch_jacobi_kernels.hpp} (88%) diff --git a/omp/preconditioner/batch_jacobi_kernels.cpp b/omp/preconditioner/batch_jacobi_kernels.cpp index 9dfe06be32b..90c8f0c1865 100644 --- a/omp/preconditioner/batch_jacobi_kernels.cpp +++ b/omp/preconditioner/batch_jacobi_kernels.cpp @@ -10,6 +10,7 @@ #include "reference/base/batch_struct.hpp" #include "reference/matrix/batch_struct.hpp" #include "reference/preconditioner/batch_block_jacobi.hpp" +#include "reference/preconditioner/batch_jacobi_kernels.hpp" #include "reference/preconditioner/batch_scalar_jacobi.hpp" @@ -19,16 +20,6 @@ namespace omp { namespace batch_jacobi { -namespace { - 
- -// Note: Do not change the ordering -#include "reference/preconditioner/batch_jacobi_kernels.hpp.inc" - - -} // unnamed namespace - - template void compute_cumulative_block_storage( std::shared_ptr exec, const size_type num_blocks, @@ -78,8 +69,9 @@ void extract_common_blocks_pattern( { #pragma omp parallel for for (size_type k = 0; k < num_blocks; k++) { - extract_block_pattern_impl(k, first_sys_csr, cumulative_block_storage, - block_pointers, blocks_pattern); + batch_single_kernels::extract_block_pattern_impl( + k, first_sys_csr, cumulative_block_storage, block_pointers, + blocks_pattern); } } @@ -105,9 +97,9 @@ void compute_block_jacobi( const auto A_entry = gko::batch::matrix::extract_batch_item(A_batch, batch_idx); - compute_block_jacobi_impl(batch_idx, block_idx, A_entry, num_blocks, - cumulative_block_storage, block_pointers, - blocks_pattern, blocks); + batch_single_kernels::compute_block_jacobi_impl( + batch_idx, block_idx, A_entry, num_blocks, cumulative_block_storage, + block_pointers, blocks_pattern, blocks); } } diff --git a/reference/preconditioner/batch_jacobi_kernels.cpp b/reference/preconditioner/batch_jacobi_kernels.cpp index 3c03a21fae7..a012e019b41 100644 --- a/reference/preconditioner/batch_jacobi_kernels.cpp +++ b/reference/preconditioner/batch_jacobi_kernels.cpp @@ -10,6 +10,7 @@ #include "reference/base/batch_struct.hpp" #include "reference/matrix/batch_struct.hpp" #include "reference/preconditioner/batch_block_jacobi.hpp" +#include "reference/preconditioner/batch_jacobi_kernels.hpp" #include "reference/preconditioner/batch_scalar_jacobi.hpp" @@ -19,16 +20,6 @@ namespace reference { namespace batch_jacobi { -namespace { - - -// Note: Do not change the ordering -#include "reference/preconditioner/batch_jacobi_kernels.hpp.inc" - - -} // unnamed namespace - - template void compute_cumulative_block_storage( std::shared_ptr exec, const size_type num_blocks, @@ -74,8 +65,9 @@ void extract_common_blocks_pattern( IndexType* const blocks_pattern) 
{ for (size_type k = 0; k < num_blocks; k++) { - extract_block_pattern_impl(k, first_sys_csr, cumulative_block_storage, - block_pointers, blocks_pattern); + batch_single_kernels::extract_block_pattern_impl( + k, first_sys_csr, cumulative_block_storage, block_pointers, + blocks_pattern); } } @@ -98,9 +90,9 @@ void compute_block_jacobi( for (size_type k = 0; k < num_blocks; k++) { const auto A_entry = gko::batch::matrix::extract_batch_item(A_batch, batch_idx); - compute_block_jacobi_impl(batch_idx, k, A_entry, num_blocks, - cumulative_block_storage, block_pointers, - blocks_pattern, blocks); + batch_single_kernels::compute_block_jacobi_impl( + batch_idx, k, A_entry, num_blocks, cumulative_block_storage, + block_pointers, blocks_pattern, blocks); } } } diff --git a/reference/preconditioner/batch_jacobi_kernels.hpp.inc b/reference/preconditioner/batch_jacobi_kernels.hpp similarity index 88% rename from reference/preconditioner/batch_jacobi_kernels.hpp.inc rename to reference/preconditioner/batch_jacobi_kernels.hpp index 0f04841bc7b..ee44f21eb97 100644 --- a/reference/preconditioner/batch_jacobi_kernels.hpp.inc +++ b/reference/preconditioner/batch_jacobi_kernels.hpp @@ -2,6 +2,27 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_REFERENCE_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_ +#define GKO_REFERENCE_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_ + + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/preconditioner/batch_jacobi_helpers.hpp" +#include "reference/base/batch_multi_vector_kernels.hpp" +#include "reference/base/batch_struct.hpp" +#include "reference/matrix/batch_csr_kernels.hpp" +#include "reference/matrix/batch_dense_kernels.hpp" +#include "reference/matrix/batch_ell_kernels.hpp" +#include "reference/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_single_kernels { + + template inline void extract_block_pattern_impl( const size_type k, 
const matrix::Csr* const first_sys_csr, @@ -164,3 +185,12 @@ inline void compute_block_jacobi_impl( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif From a0b83052fef218a465d6b1dfd7f64ee7b171b0b4 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sat, 24 Aug 2024 12:57:14 +0200 Subject: [PATCH 166/448] [dpcpp] reorg batch preconds --- ..._jacobi.hpp.inc => batch_block_jacobi.hpp} | 40 +++++++++++ dpcpp/preconditioner/batch_identity.hpp | 72 +++++++++++++++++++ dpcpp/preconditioner/batch_identity.hpp.inc | 31 -------- .../batch_jacobi_kernels.dp.cpp | 41 +++++------ ...rnels.hpp.inc => batch_jacobi_kernels.hpp} | 42 +++++++++++ .../preconditioner/batch_preconditioners.hpp | 20 +----- ...jacobi.hpp.inc => batch_scalar_jacobi.hpp} | 42 +++++++++++ 7 files changed, 218 insertions(+), 70 deletions(-) rename dpcpp/preconditioner/{batch_block_jacobi.hpp.inc => batch_block_jacobi.hpp} (81%) create mode 100644 dpcpp/preconditioner/batch_identity.hpp delete mode 100644 dpcpp/preconditioner/batch_identity.hpp.inc rename dpcpp/preconditioner/{batch_jacobi_kernels.hpp.inc => batch_jacobi_kernels.hpp} (87%) rename dpcpp/preconditioner/{batch_scalar_jacobi.hpp.inc => batch_scalar_jacobi.hpp} (82%) diff --git a/dpcpp/preconditioner/batch_block_jacobi.hpp.inc b/dpcpp/preconditioner/batch_block_jacobi.hpp similarity index 81% rename from dpcpp/preconditioner/batch_block_jacobi.hpp.inc rename to dpcpp/preconditioner/batch_block_jacobi.hpp index 442914b3933..b01de33c299 100644 --- a/dpcpp/preconditioner/batch_block_jacobi.hpp.inc +++ b/dpcpp/preconditioner/batch_block_jacobi.hpp @@ -2,6 +2,39 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_DPCPP_PRECONDITIONER_BATCH_BLOCK_JACOBI_HPP_ +#define GKO_DPCPP_PRECONDITIONER_BATCH_BLOCK_JACOBI_HPP_ + + +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include 
"core/preconditioner/batch_jacobi_helpers.hpp" +#include "dpcpp/base/batch_multi_vector_kernels.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_csr_kernels.hpp" +#include "dpcpp/matrix/batch_dense_kernels.hpp" +#include "dpcpp/matrix/batch_ell_kernels.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_preconditioner { + + /** * BlockJacobi preconditioner for batch solvers. */ @@ -129,3 +162,10 @@ class BlockJacobi final { const int* __restrict__ const block_ptrs_arr_; const int* __restrict__ const row_block_map_; }; + +} // namespace batch_preconditioner +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + +#endif diff --git a/dpcpp/preconditioner/batch_identity.hpp b/dpcpp/preconditioner/batch_identity.hpp new file mode 100644 index 00000000000..0696d028059 --- /dev/null +++ b/dpcpp/preconditioner/batch_identity.hpp @@ -0,0 +1,72 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_DPCPP_PRECONDITIONER_BATCH_IDENTITY_HPP_ +#define GKO_DPCPP_PRECONDITIONER_BATCH_IDENTITY_HPP_ + + +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "dpcpp/base/batch_multi_vector_kernels.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include 
"dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_csr_kernels.hpp" +#include "dpcpp/matrix/batch_dense_kernels.hpp" +#include "dpcpp/matrix/batch_ell_kernels.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_preconditioner { + + +/** + * @see reference/preconditioner/batch_identity.hpp + */ +template +class Identity final { +public: + using value_type = ValueType; + + static constexpr int work_size = 0; + + static int dynamic_work_size(int, int) { return 0; } + + template + void generate(size_type, const batch_item_type&, ValueType*, + sycl::nd_item<3> item_ct1) + {} + + __dpct_inline__ void apply(const int num_rows, const ValueType* const r, + ValueType* const z, + sycl::nd_item<3> item_ct1) const + { + for (int li = item_ct1.get_local_linear_id(); li < num_rows; + li += item_ct1.get_local_range().size()) { + z[li] = r[li]; + } + } +}; + + +} // namespace batch_preconditioner +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif diff --git a/dpcpp/preconditioner/batch_identity.hpp.inc b/dpcpp/preconditioner/batch_identity.hpp.inc deleted file mode 100644 index 4b5314363da..00000000000 --- a/dpcpp/preconditioner/batch_identity.hpp.inc +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -/** - * @see reference/preconditioner/batch_identity.hpp - */ -template -class Identity final { -public: - using value_type = ValueType; - - static constexpr int work_size = 0; - - static int dynamic_work_size(int, int) { return 0; } - - template - void generate(size_type, const batch_item_type&, ValueType*, - sycl::nd_item<3> item_ct1) - {} - - __dpct_inline__ void apply(const int num_rows, const ValueType* const r, - ValueType* const z, - sycl::nd_item<3> item_ct1) const - { - for (int li = 
item_ct1.get_local_linear_id(); li < num_rows; - li += item_ct1.get_local_range().size()) { - z[li] = r[li]; - } - } -}; diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp index e66e7141a47..d85f93e74f2 100644 --- a/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp @@ -16,6 +16,7 @@ #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/matrix/batch_struct.hpp" +#include "dpcpp/preconditioner/batch_jacobi_kernels.hpp" #include "dpcpp/preconditioner/jacobi_common.hpp" @@ -23,16 +24,12 @@ namespace gko { namespace kernels { namespace dpcpp { namespace batch_jacobi { - - namespace { using batch_jacobi_dpcpp_compiled_max_block_sizes = gko::kernels::dpcpp::jacobi::compiled_kernels; -#include "dpcpp/preconditioner/batch_jacobi_kernels.hpp.inc" - } // namespace @@ -96,15 +93,15 @@ void extract_common_blocks_pattern( const auto col_idxs = first_sys_csr->get_const_col_idxs(); exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(subgroup_size)]] { - extract_common_block_pattern_kernel( - static_cast(nrows), row_ptrs, - col_idxs, num_blocks, - cumulative_block_storage, block_pointers, - map_block_row, blocks_pattern, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(subgroup_size)]] { + batch_single_kernels::extract_common_block_pattern_kernel( + static_cast(nrows), row_ptrs, col_idxs, num_blocks, + cumulative_block_storage, block_pointers, map_block_row, + blocks_pattern, item_ct1); + }); }); } @@ -142,15 +139,15 @@ void compute_block_jacobi_helper( dim3 grid(ceildiv(num_blocks * nbatch * subgroup_size, group_size)); exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - 
[[intel::reqd_sub_group_size(subgroup_size)]] { - compute_block_jacobi_kernel( - nbatch, static_cast(nnz), - sys_csr_values, num_blocks, - cumulative_block_storage, block_pointers, - blocks_pattern, blocks, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(subgroup_size)]] { + batch_single_kernels::compute_block_jacobi_kernel( + nbatch, static_cast(nnz), sys_csr_values, + num_blocks, cumulative_block_storage, block_pointers, + blocks_pattern, blocks, item_ct1); + }); }); } diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.hpp.inc b/dpcpp/preconditioner/batch_jacobi_kernels.hpp similarity index 87% rename from dpcpp/preconditioner/batch_jacobi_kernels.hpp.inc rename to dpcpp/preconditioner/batch_jacobi_kernels.hpp index 930850aaf1a..b8c75c9efa0 100644 --- a/dpcpp/preconditioner/batch_jacobi_kernels.hpp.inc +++ b/dpcpp/preconditioner/batch_jacobi_kernels.hpp @@ -2,6 +2,39 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_DPCPP_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_ +#define GKO_DPCPP_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_ + + +#include + +#include + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/preconditioner/batch_jacobi_helpers.hpp" +#include "dpcpp/base/batch_multi_vector_kernels.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_csr_kernels.hpp" +#include "dpcpp/matrix/batch_dense_kernels.hpp" +#include "dpcpp/matrix/batch_ell_kernels.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace 
batch_single_kernels { + + __dpct_inline__ void extract_common_block_pattern_kernel( const int nrows, const int* const __restrict__ sys_row_ptrs, const int* const __restrict__ sys_col_idxs, const size_type num_blocks, @@ -203,3 +236,12 @@ __dpct_inline__ void compute_block_jacobi_kernel( } } } + + +} // namespace batch_single_kernels +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif diff --git a/dpcpp/preconditioner/batch_preconditioners.hpp b/dpcpp/preconditioner/batch_preconditioners.hpp index 607cd7fa7bf..208e35b21b3 100644 --- a/dpcpp/preconditioner/batch_preconditioners.hpp +++ b/dpcpp/preconditioner/batch_preconditioners.hpp @@ -8,23 +8,9 @@ #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" - - -namespace gko { -namespace kernels { -namespace dpcpp { -namespace batch_preconditioner { - - -#include "dpcpp/preconditioner/batch_block_jacobi.hpp.inc" -#include "dpcpp/preconditioner/batch_identity.hpp.inc" -#include "dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc" - - -} // namespace batch_preconditioner -} // namespace dpcpp -} // namespace kernels -} // namespace gko +#include "dpcpp/preconditioner/batch_block_jacobi.hpp" +#include "dpcpp/preconditioner/batch_identity.hpp" +#include "dpcpp/preconditioner/batch_scalar_jacobi.hpp" #endif // GKO_DPCPP_PRECONDITIONER_BATCH_PRECONDITIONERS_HPP_ diff --git a/dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc b/dpcpp/preconditioner/batch_scalar_jacobi.hpp similarity index 82% rename from dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc rename to dpcpp/preconditioner/batch_scalar_jacobi.hpp index 3bb652a5032..c8963c7b592 100644 --- a/dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc +++ b/dpcpp/preconditioner/batch_scalar_jacobi.hpp @@ -2,6 +2,39 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_DPCPP_PRECONDITIONER_BATCH_SCALAR_JACOBI_HPP_ +#define GKO_DPCPP_PRECONDITIONER_BATCH_SCALAR_JACOBI_HPP_ + + +#include + +#include 
+ +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/preconditioner/batch_jacobi_helpers.hpp" +#include "dpcpp/base/batch_multi_vector_kernels.hpp" +#include "dpcpp/base/batch_struct.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/intrinsics.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/matrix/batch_csr_kernels.hpp" +#include "dpcpp/matrix/batch_dense_kernels.hpp" +#include "dpcpp/matrix/batch_ell_kernels.hpp" +#include "dpcpp/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_preconditioner { + + /** * (Scalar) Jacobi preconditioner for batch solvers. */ @@ -134,3 +167,12 @@ class ScalarJacobi final { private: value_type* __restrict__ work_; }; + + +} // namespace batch_preconditioner +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif From bf6349965078d52da867371d29c8487c578ba344 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sat, 24 Aug 2024 14:47:24 +0200 Subject: [PATCH 167/448] [cuda, hip] reorg batch_criteria --- ...ch_criteria.hpp.inc => batch_criteria.hpp} | 21 +++++++++++++++ cuda/stop/batch_criteria.cuh | 26 ------------------- hip/stop/batch_criteria.hip.hpp | 26 ------------------- 3 files changed, 21 insertions(+), 52 deletions(-) rename common/cuda_hip/stop/{batch_criteria.hpp.inc => batch_criteria.hpp} (77%) delete mode 100644 cuda/stop/batch_criteria.cuh delete mode 100644 hip/stop/batch_criteria.hip.hpp diff --git a/common/cuda_hip/stop/batch_criteria.hpp.inc b/common/cuda_hip/stop/batch_criteria.hpp similarity index 77% rename from common/cuda_hip/stop/batch_criteria.hpp.inc rename to common/cuda_hip/stop/batch_criteria.hpp index 
38072467765..a7ae2005cc0 100644 --- a/common/cuda_hip/stop/batch_criteria.hpp.inc +++ b/common/cuda_hip/stop/batch_criteria.hpp @@ -2,6 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_ +#define GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_ + + +#include + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_stop { + + /** * @see reference/stop/batch_criteria.hpp */ @@ -49,3 +62,11 @@ class SimpleAbsResidual { private: const real_type abs_tol_; }; + + +} // namespace batch_stop +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + +#endif diff --git a/cuda/stop/batch_criteria.cuh b/cuda/stop/batch_criteria.cuh deleted file mode 100644 index f4f434dda11..00000000000 --- a/cuda/stop/batch_criteria.cuh +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ -#define GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ - - -#include - - -namespace gko { -namespace kernels { -namespace cuda { -namespace batch_stop { - - -#include "common/cuda_hip/stop/batch_criteria.hpp.inc" - - -} // namespace batch_stop -} // namespace cuda -} // namespace kernels -} // namespace gko - -#endif // GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ diff --git a/hip/stop/batch_criteria.hip.hpp b/hip/stop/batch_criteria.hip.hpp deleted file mode 100644 index 1f721e36aaf..00000000000 --- a/hip/stop/batch_criteria.hip.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ -#define GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace hip { -namespace batch_stop { - - -#include "common/cuda_hip/stop/batch_criteria.hpp.inc" - - -} // namespace batch_stop -} // namespace hip -} // namespace kernels -} // 
namespace gko - -#endif // GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ From 1f46611bc33d6805cbd920ae4a12b2cefb78ad53 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Sat, 24 Aug 2024 14:47:51 +0200 Subject: [PATCH 168/448] [cuda, hip] reorg batch_logger --- ...{batch_logger.hpp.inc => batch_logger.hpp} | 21 +++++++++++++++ core/solver/batch_dispatch.hpp | 8 +++--- cuda/log/batch_logger.cuh | 27 ------------------- hip/log/batch_logger.hip.hpp | 26 ------------------ 4 files changed, 25 insertions(+), 57 deletions(-) rename common/cuda_hip/log/{batch_logger.hpp.inc => batch_logger.hpp} (71%) delete mode 100644 cuda/log/batch_logger.cuh delete mode 100644 hip/log/batch_logger.hip.hpp diff --git a/common/cuda_hip/log/batch_logger.hpp.inc b/common/cuda_hip/log/batch_logger.hpp similarity index 71% rename from common/cuda_hip/log/batch_logger.hpp.inc rename to common/cuda_hip/log/batch_logger.hpp index 04b614b50f9..5e897b3c67d 100644 --- a/common/cuda_hip/log/batch_logger.hpp.inc +++ b/common/cuda_hip/log/batch_logger.hpp @@ -2,6 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_ +#define GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_ + + +#include + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_log { + /** * @see reference/log/batch_logger.hpp */ @@ -28,3 +40,12 @@ class SimpleFinalLogger final { real_type* const final_residuals_; idx_type* const final_iters_; }; + + +} // namespace batch_log +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index 178f6b1beae..018a6674df5 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -25,10 +25,10 @@ #include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/log/batch_logger.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include 
"common/cuda_hip/preconditioner/batch_preconditioners.hpp" -#include "cuda/log/batch_logger.cuh" -#include "cuda/stop/batch_criteria.cuh" +#include "common/cuda_hip/stop/batch_criteria.hpp" namespace gko { @@ -52,10 +52,10 @@ using DeviceValueType = typename gko::kernels::cuda::cuda_type; #include "common/cuda_hip/base/batch_struct.hpp" +#include "common/cuda_hip/log/batch_logger.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/preconditioner/batch_preconditioners.hpp" -#include "hip/log/batch_logger.hip.hpp" -#include "hip/stop/batch_criteria.hip.hpp" +#include "common/cuda_hip/stop/batch_criteria.hpp" namespace gko { diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh deleted file mode 100644 index 3e53d6ef0a6..00000000000 --- a/cuda/log/batch_logger.cuh +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_LOG_BATCH_LOGGER_CUH_ -#define GKO_CUDA_LOG_BATCH_LOGGER_CUH_ - - -#include - - -namespace gko { -namespace kernels { -namespace cuda { -namespace batch_log { - - -#include "common/cuda_hip/log/batch_logger.hpp.inc" - - -} // namespace batch_log -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_LOG_BATCH_LOGGER_CUH_ diff --git a/hip/log/batch_logger.hip.hpp b/hip/log/batch_logger.hip.hpp deleted file mode 100644 index a2540f2bd9d..00000000000 --- a/hip/log/batch_logger.hip.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ -#define GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace hip { -namespace batch_log { - -#include "common/cuda_hip/log/batch_logger.hpp.inc" - - -} // namespace batch_log -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // 
GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ From 3a397d16f0c4facc3c028f7e5612d5f55c5e5cb5 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 26 Aug 2024 10:51:39 +0200 Subject: [PATCH 169/448] [cuda, hip] remove unnecessary headers --- .../base/batch_multi_vector_kernels.hpp | 2 - common/cuda_hip/matrix/batch_csr_kernels.cpp | 3 -- common/cuda_hip/matrix/batch_csr_kernels.hpp | 7 --- .../cuda_hip/matrix/batch_dense_kernels.cpp | 3 -- .../cuda_hip/matrix/batch_dense_kernels.hpp | 7 --- common/cuda_hip/matrix/batch_ell_kernels.cpp | 3 -- common/cuda_hip/matrix/batch_ell_kernels.hpp | 8 --- .../preconditioner/batch_block_jacobi.hpp | 8 --- .../preconditioner/batch_identity.hpp | 13 ----- .../preconditioner/batch_jacobi_kernels.hpp | 8 --- .../preconditioner/batch_scalar_jacobi.hpp | 8 --- .../solver/batch_bicgstab_kernels.hpp | 50 +++++++------------ common/cuda_hip/solver/batch_cg_kernels.hpp | 35 +++++-------- cuda/solver/batch_cg_kernels.cu | 8 --- 14 files changed, 31 insertions(+), 132 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp index 3f5763474c2..5c6210eeaed 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp @@ -21,9 +21,7 @@ #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" #include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/segment_scan.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/warp_blas.hpp" diff --git a/common/cuda_hip/matrix/batch_csr_kernels.cpp b/common/cuda_hip/matrix/batch_csr_kernels.cpp index 35dc2c17e03..d48cdbaf32a 100644 --- a/common/cuda_hip/matrix/batch_csr_kernels.cpp +++ b/common/cuda_hip/matrix/batch_csr_kernels.cpp @@ -4,9 +4,6 @@ #include 
"common/cuda_hip/matrix/batch_csr_kernels.hpp" -#include -#include - #include #include #include diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp b/common/cuda_hip/matrix/batch_csr_kernels.hpp index 5ed66c59d14..b1520f2d808 100644 --- a/common/cuda_hip/matrix/batch_csr_kernels.hpp +++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp @@ -6,9 +6,6 @@ #define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_CSR_KERNELS_HPP_ -#include -#include - #include #include #include @@ -22,11 +19,7 @@ #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/segment_scan.hpp" #include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" diff --git a/common/cuda_hip/matrix/batch_dense_kernels.cpp b/common/cuda_hip/matrix/batch_dense_kernels.cpp index 44dad55aa70..ee4d87abaa3 100644 --- a/common/cuda_hip/matrix/batch_dense_kernels.cpp +++ b/common/cuda_hip/matrix/batch_dense_kernels.cpp @@ -4,9 +4,6 @@ #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" -#include -#include - #include #include #include diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp b/common/cuda_hip/matrix/batch_dense_kernels.hpp index 7902d6010fa..c9089bd9a80 100644 --- a/common/cuda_hip/matrix/batch_dense_kernels.hpp +++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp @@ -6,9 +6,6 @@ #define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_DENSE_KERNELS_HPP_ -#include -#include - #include #include #include @@ -19,12 +16,8 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include 
"common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/segment_scan.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "common/cuda_hip/components/warp_blas.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" diff --git a/common/cuda_hip/matrix/batch_ell_kernels.cpp b/common/cuda_hip/matrix/batch_ell_kernels.cpp index c56325ab824..38d34707d45 100644 --- a/common/cuda_hip/matrix/batch_ell_kernels.cpp +++ b/common/cuda_hip/matrix/batch_ell_kernels.cpp @@ -4,9 +4,6 @@ #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" -#include -#include - #include #include #include diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp b/common/cuda_hip/matrix/batch_ell_kernels.hpp index f32144dc172..a9037f5144a 100644 --- a/common/cuda_hip/matrix/batch_ell_kernels.hpp +++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp @@ -6,9 +6,6 @@ #define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_ELL_KERNELS_HPP_ -#include -#include - #include #include #include @@ -19,14 +16,9 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/segment_scan.hpp" #include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" diff --git a/common/cuda_hip/preconditioner/batch_block_jacobi.hpp b/common/cuda_hip/preconditioner/batch_block_jacobi.hpp index 5aff975e960..c01bafa875a 100644 --- a/common/cuda_hip/preconditioner/batch_block_jacobi.hpp +++ 
b/common/cuda_hip/preconditioner/batch_block_jacobi.hpp @@ -6,9 +6,6 @@ #define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_BLOCK_JACOBI_HPP_ -#include -#include - #include #include #include @@ -19,14 +16,9 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/segment_scan.hpp" #include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" diff --git a/common/cuda_hip/preconditioner/batch_identity.hpp b/common/cuda_hip/preconditioner/batch_identity.hpp index 634d3212f36..3d57bcae406 100644 --- a/common/cuda_hip/preconditioner/batch_identity.hpp +++ b/common/cuda_hip/preconditioner/batch_identity.hpp @@ -6,9 +6,6 @@ #define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_IDENTITY_HPP_ -#include -#include - #include #include #include @@ -16,16 +13,6 @@ #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/segment_scan.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include 
"common/cuda_hip/components/warp_blas.hpp" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" diff --git a/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp b/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp index ac9143fefb9..9a1ea7458c8 100644 --- a/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp +++ b/common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp @@ -6,9 +6,6 @@ #define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_JACOBI_KERNELS_HPP_ -#include -#include - #include #include #include @@ -18,14 +15,9 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/segment_scan.hpp" #include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" diff --git a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp index 695d31235a8..42a4f3f6aa6 100644 --- a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp +++ b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp @@ -6,9 +6,6 @@ #define GKO_COMMON_CUDA_HIP_PRECONDITIONER_BATCH_SCALAR_JACOBI_HPP_ -#include -#include - #include #include #include @@ -19,14 +16,9 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" #include 
"common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/segment_scan.hpp" #include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp index cbab8ed6961..10d235358bc 100644 --- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp +++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp @@ -5,10 +5,6 @@ #ifndef GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ - -#include -#include - #include #include #include @@ -18,14 +14,9 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/segment_scan.hpp" #include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" @@ -63,18 +54,15 @@ __device__ __forceinline__ void initialize( __syncthreads(); // r = b - A*x - batch_single_kernels::advanced_apply( - static_cast(-1.0), mat_entry, x_shared_entry, - static_cast(1.0), r_shared_entry); + advanced_apply(static_cast(-1.0), 
mat_entry, x_shared_entry, + static_cast(1.0), r_shared_entry); __syncthreads(); if (threadIdx.x / config::warp_size == 0) { - batch_single_kernels::single_rhs_compute_norm2( - subgroup, num_rows, r_shared_entry, res_norm); + single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry, res_norm); } else if (threadIdx.x / config::warp_size == 1) { // Compute norms of rhs - batch_single_kernels::single_rhs_compute_norm2( - subgroup, num_rows, b_global_entry, rhs_norm); + single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, rhs_norm); } __syncthreads(); @@ -109,8 +97,8 @@ __device__ __forceinline__ void compute_alpha( const ValueType* const v_shared_entry, ValueType& alpha) { if (threadIdx.x / config::warp_size == 0) { - batch_single_kernels::single_rhs_compute_conj_dot( - subgroup, num_rows, r_hat_shared_entry, v_shared_entry, alpha); + single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry, + v_shared_entry, alpha); } __syncthreads(); if (threadIdx.x == 0) { @@ -138,11 +126,11 @@ __device__ __forceinline__ void compute_omega( const ValueType* const s_shared_entry, ValueType& temp, ValueType& omega) { if (threadIdx.x / config::warp_size == 0) { - batch_single_kernels::single_rhs_compute_conj_dot( - subgroup, num_rows, t_shared_entry, s_shared_entry, omega); + single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry, + s_shared_entry, omega); } else if (threadIdx.x / config::warp_size == 1) { - batch_single_kernels::single_rhs_compute_conj_dot( - subgroup, num_rows, t_shared_entry, t_shared_entry, temp); + single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry, + t_shared_entry, temp); } __syncthreads(); @@ -310,8 +298,8 @@ __global__ void apply_kernel( // rho_new = < r_hat , r > = (r_hat)' * (r) if (threadIdx.x / config::warp_size == 0) { - batch_single_kernels::single_rhs_compute_conj_dot( - subgroup, num_rows, r_hat_sh, r_sh, rho_new_sh[0]); + single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh, r_sh, + rho_new_sh[0]); } 
__syncthreads(); @@ -326,7 +314,7 @@ __global__ void apply_kernel( __syncthreads(); // v = A * p_hat - batch_single_kernels::simple_apply(mat_entry, p_hat_sh, v_sh); + simple_apply(mat_entry, p_hat_sh, v_sh); __syncthreads(); // alpha = rho_new / < r_hat , v> @@ -340,8 +328,8 @@ __global__ void apply_kernel( // an estimate of residual norms if (threadIdx.x / config::warp_size == 0) { - batch_single_kernels::single_rhs_compute_norm2( - subgroup, num_rows, s_sh, norms_res_sh[0]); + single_rhs_compute_norm2(subgroup, num_rows, s_sh, + norms_res_sh[0]); } __syncthreads(); @@ -357,7 +345,7 @@ __global__ void apply_kernel( __syncthreads(); // t = A * s_hat - batch_single_kernels::simple_apply(mat_entry, s_hat_sh, t_sh); + simple_apply(mat_entry, s_hat_sh, t_sh); __syncthreads(); // omega = / @@ -372,8 +360,8 @@ __global__ void apply_kernel( __syncthreads(); if (threadIdx.x / config::warp_size == 0) { - batch_single_kernels::single_rhs_compute_norm2( - subgroup, num_rows, r_sh, norms_res_sh[0]); + single_rhs_compute_norm2(subgroup, num_rows, r_sh, + norms_res_sh[0]); } //__syncthreads(); @@ -386,7 +374,7 @@ __global__ void apply_kernel( logger.log_iteration(batch_id, iter, norms_res_sh[0]); // copy x back to global memory - batch_single_kernels::single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr); + single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr); __syncthreads(); } } diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp b/common/cuda_hip/solver/batch_cg_kernels.hpp index e7ec0505844..7ccdc5f9926 100644 --- a/common/cuda_hip/solver/batch_cg_kernels.hpp +++ b/common/cuda_hip/solver/batch_cg_kernels.hpp @@ -6,9 +6,6 @@ #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_CG_KERNELS_HPP_ -#include -#include - #include #include #include @@ -18,14 +15,9 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include 
"common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/format_conversion.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/segment_scan.hpp" #include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" @@ -58,9 +50,8 @@ __device__ __forceinline__ void initialize( __syncthreads(); // r = b - A*x - batch_single_kernels::advanced_apply( - static_cast(-1.0), mat_entry, x_shared_entry, - static_cast(1.0), r_shared_entry); + advanced_apply(static_cast(-1.0), mat_entry, x_shared_entry, + static_cast(1.0), r_shared_entry); __syncthreads(); // z = precond * r @@ -69,13 +60,12 @@ __device__ __forceinline__ void initialize( if (threadIdx.x / config::warp_size == 0) { // Compute norms of rhs - batch_single_kernels::single_rhs_compute_norm2( - subgroup, num_rows, b_global_entry, rhs_norms_sh); + single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, + rhs_norms_sh); } else if (threadIdx.x / config::warp_size == 1) { // rho_old = r' * z - batch_single_kernels::single_rhs_compute_conj_dot( - subgroup, num_rows, r_shared_entry, z_shared_entry, - rho_old_shared_entry); + single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry, + z_shared_entry, rho_old_shared_entry); } // p = z @@ -107,9 +97,8 @@ __device__ __forceinline__ void update_x_and_r( ValueType* const x_shared_entry, ValueType* const r_shared_entry) { if (threadIdx.x / config::warp_size == 0) { - batch_single_kernels::single_rhs_compute_conj_dot( - subgroup, num_rows, p_shared_entry, Ap_shared_entry, - alpha_shared_entry); + single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry, + Ap_shared_entry, alpha_shared_entry); } __syncthreads(); @@ -225,7 +214,7 @@ __global__ void apply_kernel(const 
gko::kernels::batch_cg::storage_config sconf, } // Ap = A * p - batch_single_kernels::simple_apply(mat_entry, p_sh, Ap_sh); + simple_apply(mat_entry, p_sh, Ap_sh); __syncthreads(); // alpha = rho_old / (p' * Ap) @@ -241,8 +230,8 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf, if (threadIdx.x / config::warp_size == 0) { // rho_new = (r)' * (z) - batch_single_kernels::single_rhs_compute_conj_dot( - subgroup, num_rows, r_sh, z_sh, rho_new_sh[0]); + single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh, + rho_new_sh[0]); } __syncthreads(); @@ -261,7 +250,7 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf, logger.log_iteration(batch_id, iter, norms_res_sh[0]); // copy x back to global memory - batch_single_kernels::single_rhs_copy(num_rows, x_sh, x_global_entry); + single_rhs_copy(num_rows, x_sh, x_global_entry); __syncthreads(); } } diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index b8ead675a3c..3f7dac1d08a 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -4,21 +4,13 @@ #include "core/solver/batch_cg_kernels.hpp" -#include -#include - #include #include #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp" From dcb72c3c54f766e7bc06a8d01b9e5e0ca81709a5 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 26 Aug 2024 11:27:13 +0200 Subject: [PATCH 170/448] 
[dpcpp] rem headers and namespaces --- .../base/batch_multi_vector_kernels.hpp | 4 -- dpcpp/base/batch_multi_vector_kernels.hpp | 1 - dpcpp/matrix/batch_csr_kernels.hpp | 2 - dpcpp/matrix/batch_dense_kernels.hpp | 2 - dpcpp/matrix/batch_ell_kernels.hpp | 2 - dpcpp/preconditioner/batch_block_jacobi.hpp | 6 -- dpcpp/preconditioner/batch_identity.hpp | 6 -- dpcpp/preconditioner/batch_jacobi_kernels.hpp | 1 - dpcpp/preconditioner/batch_scalar_jacobi.hpp | 6 -- dpcpp/solver/batch_bicgstab_kernels.hpp | 56 ++++++++----------- dpcpp/solver/batch_cg_kernels.hpp | 30 +++++----- 11 files changed, 36 insertions(+), 80 deletions(-) diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp index 5c6210eeaed..63836280544 100644 --- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp @@ -6,9 +6,6 @@ #define GKO_COMMON_CUDA_HIP_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_ -#include -#include - #include #include #include @@ -18,7 +15,6 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/reduction.hpp" diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp b/dpcpp/base/batch_multi_vector_kernels.hpp index 142eba259de..74abaeda86f 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp +++ b/dpcpp/base/batch_multi_vector_kernels.hpp @@ -17,7 +17,6 @@ #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" diff --git a/dpcpp/matrix/batch_csr_kernels.hpp b/dpcpp/matrix/batch_csr_kernels.hpp index 2b195de308b..37dc5a2c52c 
100644 --- a/dpcpp/matrix/batch_csr_kernels.hpp +++ b/dpcpp/matrix/batch_csr_kernels.hpp @@ -18,8 +18,6 @@ #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/intrinsics.dp.hpp" -#include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/matrix/batch_struct.hpp" diff --git a/dpcpp/matrix/batch_dense_kernels.hpp b/dpcpp/matrix/batch_dense_kernels.hpp index 59aee9a7208..a8f741bc3d0 100644 --- a/dpcpp/matrix/batch_dense_kernels.hpp +++ b/dpcpp/matrix/batch_dense_kernels.hpp @@ -18,8 +18,6 @@ #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/intrinsics.dp.hpp" -#include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/matrix/batch_struct.hpp" diff --git a/dpcpp/matrix/batch_ell_kernels.hpp b/dpcpp/matrix/batch_ell_kernels.hpp index 5a1ba163216..fb6bd3d8121 100644 --- a/dpcpp/matrix/batch_ell_kernels.hpp +++ b/dpcpp/matrix/batch_ell_kernels.hpp @@ -18,8 +18,6 @@ #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/intrinsics.dp.hpp" -#include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/matrix/batch_struct.hpp" diff --git a/dpcpp/preconditioner/batch_block_jacobi.hpp b/dpcpp/preconditioner/batch_block_jacobi.hpp index b01de33c299..a7431f919a5 100644 --- a/dpcpp/preconditioner/batch_block_jacobi.hpp +++ b/dpcpp/preconditioner/batch_block_jacobi.hpp @@ -13,19 +13,13 @@ #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" -#include "dpcpp/base/batch_multi_vector_kernels.hpp" #include "dpcpp/base/batch_struct.hpp" #include "dpcpp/base/config.hpp" #include 
"dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/intrinsics.dp.hpp" -#include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" -#include "dpcpp/matrix/batch_csr_kernels.hpp" -#include "dpcpp/matrix/batch_dense_kernels.hpp" -#include "dpcpp/matrix/batch_ell_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" diff --git a/dpcpp/preconditioner/batch_identity.hpp b/dpcpp/preconditioner/batch_identity.hpp index 0696d028059..5d6a1cfcb65 100644 --- a/dpcpp/preconditioner/batch_identity.hpp +++ b/dpcpp/preconditioner/batch_identity.hpp @@ -12,19 +12,13 @@ #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" -#include "dpcpp/base/batch_multi_vector_kernels.hpp" #include "dpcpp/base/batch_struct.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/intrinsics.dp.hpp" -#include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" -#include "dpcpp/matrix/batch_csr_kernels.hpp" -#include "dpcpp/matrix/batch_dense_kernels.hpp" -#include "dpcpp/matrix/batch_ell_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.hpp b/dpcpp/preconditioner/batch_jacobi_kernels.hpp index b8c75c9efa0..769ebc47a57 100644 --- a/dpcpp/preconditioner/batch_jacobi_kernels.hpp +++ b/dpcpp/preconditioner/batch_jacobi_kernels.hpp @@ -20,7 +20,6 @@ #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/matrix/batch_csr_kernels.hpp" diff --git 
a/dpcpp/preconditioner/batch_scalar_jacobi.hpp b/dpcpp/preconditioner/batch_scalar_jacobi.hpp index c8963c7b592..e48188c32c2 100644 --- a/dpcpp/preconditioner/batch_scalar_jacobi.hpp +++ b/dpcpp/preconditioner/batch_scalar_jacobi.hpp @@ -13,19 +13,13 @@ #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" -#include "dpcpp/base/batch_multi_vector_kernels.hpp" #include "dpcpp/base/batch_struct.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/intrinsics.dp.hpp" -#include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" -#include "dpcpp/matrix/batch_csr_kernels.hpp" -#include "dpcpp/matrix/batch_dense_kernels.hpp" -#include "dpcpp/matrix/batch_ell_kernels.hpp" #include "dpcpp/matrix/batch_struct.hpp" diff --git a/dpcpp/solver/batch_bicgstab_kernels.hpp b/dpcpp/solver/batch_bicgstab_kernels.hpp index a6db9e7470a..c670725503e 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.hpp +++ b/dpcpp/solver/batch_bicgstab_kernels.hpp @@ -19,7 +19,6 @@ #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/matrix/batch_csr_kernels.hpp" @@ -65,19 +64,17 @@ __dpct_inline__ void initialize( item_ct1.barrier(sycl::access::fence_space::global_and_local); // r = b - A*x - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply( - static_cast(-1.0), mat_global_entry, x_shared_entry, - static_cast(1.0), r_shared_entry, item_ct1); + advanced_apply(static_cast(-1.0), mat_global_entry, + x_shared_entry, static_cast(1.0), r_shared_entry, + item_ct1); 
item_ct1.barrier(sycl::access::fence_space::global_and_local); if (sg_id == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_norm2_sg(num_rows, r_shared_entry, res_norm, - item_ct1); + single_rhs_compute_norm2_sg(num_rows, r_shared_entry, res_norm, + item_ct1); } else if (sg_id == 1) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norm, - item_ct1); + single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norm, + item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -120,9 +117,8 @@ __dpct_inline__ void compute_alpha(const int num_rows, const ValueType& rho_new, const auto sg_id = sg.get_group_id(); const auto tid = item_ct1.get_local_linear_id(); if (sg_id == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot_sg(num_rows, r_hat_shared_entry, - v_shared_entry, alpha, item_ct1); + single_rhs_compute_conj_dot_sg(num_rows, r_hat_shared_entry, + v_shared_entry, alpha, item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); if (tid == 0) { @@ -158,13 +154,11 @@ __dpct_inline__ void compute_omega(const int num_rows, const auto sg_id = sg.get_group_id(); const auto tid = item_ct1.get_local_linear_id(); if (sg_id == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, - s_shared_entry, omega, item_ct1); + single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, s_shared_entry, + omega, item_ct1); } else if (sg_id == 1) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, - t_shared_entry, temp, item_ct1); + single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, t_shared_entry, + temp, item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); if (tid == 0) { @@ -345,9 +339,8 @@ void apply_kernel(const 
gko::kernels::batch_bicgstab::storage_config sconf, // rho_new = < r_hat , r > = (r_hat)' * (r) if (sg_id == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_conj_dot_sg(num_rows, r_hat_sh, r_sh, - rho_new_sh[0], item_ct1); + single_rhs_compute_conj_dot_sg(num_rows, r_hat_sh, r_sh, + rho_new_sh[0], item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -362,8 +355,7 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, item_ct1.barrier(sycl::access::fence_space::global_and_local); // v = A * p_hat - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( - mat_global_entry, p_hat_sh, v_sh, item_ct1); + simple_apply(mat_global_entry, p_hat_sh, v_sh, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); // alpha = rho_new / < r_hat , v> @@ -377,9 +369,8 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, // an estimate of residual norms if (sg_id == 0) { - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - single_rhs_compute_norm2_sg(num_rows, s_sh, norms_res_sh[0], - item_ct1); + single_rhs_compute_norm2_sg(num_rows, s_sh, norms_res_sh[0], + item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -394,8 +385,7 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, item_ct1.barrier(sycl::access::fence_space::global_and_local); // t = A * s_hat - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply( - mat_global_entry, s_hat_sh, t_sh, item_ct1); + simple_apply(mat_global_entry, s_hat_sh, t_sh, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); // omega = / @@ -409,9 +399,8 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, item_ct1.barrier(sycl::access::fence_space::global_and_local); if (sg_id == 0) - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels:: - 
single_rhs_compute_norm2_sg(num_rows, r_sh, norms_res_sh[0], - item_ct1); + single_rhs_compute_norm2_sg(num_rows, r_sh, norms_res_sh[0], + item_ct1); if (tid == group_size - 1) { rho_old_sh[0] = rho_new_sh[0]; } @@ -421,8 +410,7 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf, logger.log_iteration(batch_id, iter, norms_res_sh[0]); // copy x back to global memory - gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel( - num_rows, x_sh, x_global_entry, item_ct1); + copy_kernel(num_rows, x_sh, x_global_entry, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); } diff --git a/dpcpp/solver/batch_cg_kernels.hpp b/dpcpp/solver/batch_cg_kernels.hpp index 67df0a17236..1619e64aa2f 100644 --- a/dpcpp/solver/batch_cg_kernels.hpp +++ b/dpcpp/solver/batch_cg_kernels.hpp @@ -19,7 +19,6 @@ #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" -#include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/matrix/batch_csr_kernels.hpp" @@ -59,9 +58,9 @@ __dpct_inline__ void initialize( item_ct1.barrier(sycl::access::fence_space::global_and_local); // r = b - A*x - batch_single_kernels::advanced_apply( - static_cast(-1.0), mat_global_entry, x_shared_entry, - static_cast(1.0), r_shared_entry, item_ct1); + advanced_apply(static_cast(-1.0), mat_global_entry, + x_shared_entry, static_cast(1.0), r_shared_entry, + item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -72,11 +71,11 @@ __dpct_inline__ void initialize( // Compute norms of rhs // and rho_old = r' * z if (sg_id == 0) { - batch_single_kernels::single_rhs_compute_norm2_sg( - num_rows, b_global_entry, rhs_norms, item_ct1); + single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norms, + item_ct1); } else if (sg_id == 1) { - batch_single_kernels::single_rhs_compute_conj_dot_sg( - 
num_rows, r_shared_entry, z_shared_entry, rho_old, item_ct1); + single_rhs_compute_conj_dot_sg(num_rows, r_shared_entry, z_shared_entry, + rho_old, item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -112,9 +111,9 @@ __dpct_inline__ void update_x_and_r( auto sg = item_ct1.get_sub_group(); const auto tid = item_ct1.get_local_linear_id(); if (sg.get_group_id() == 0) { - batch_single_kernels::single_rhs_compute_conj_dot_sg( - num_rows, p_shared_entry, Ap_shared_entry, alpha_shared_entry, - item_ct1); + single_rhs_compute_conj_dot_sg(num_rows, p_shared_entry, + Ap_shared_entry, alpha_shared_entry, + item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); if (tid == 0) { @@ -236,8 +235,7 @@ __dpct_inline__ void apply_kernel( break; } // Ap = A * p - batch_single_kernels::simple_apply(mat_global_entry, p_sh, Ap_sh, - item_ct1); + simple_apply(mat_global_entry, p_sh, Ap_sh, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); // alpha = rho_old / (p' * Ap) @@ -254,8 +252,8 @@ __dpct_inline__ void apply_kernel( // rho_new = (r)' * (z) if (sg_id == 0) { - batch_single_kernels::single_rhs_compute_conj_dot_sg( - num_rows, r_sh, z_sh, rho_new_sh[0], item_ct1); + single_rhs_compute_conj_dot_sg(num_rows, r_sh, z_sh, rho_new_sh[0], + item_ct1); } item_ct1.barrier(sycl::access::fence_space::global_and_local); @@ -272,7 +270,7 @@ __dpct_inline__ void apply_kernel( logger.log_iteration(batch_id, iter, norms_res_sh[0]); // copy x back to global memory - batch_single_kernels::copy_kernel(num_rows, x_sh, x_global_entry, item_ct1); + copy_kernel(num_rows, x_sh, x_global_entry, item_ct1); item_ct1.barrier(sycl::access::fence_space::global_and_local); } From 212d2c4bb0116cb8091f6cbb529ff1eefbbd71d5 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 26 Aug 2024 12:59:02 +0200 Subject: [PATCH 171/448] fixup! 
[cuda, hip] remove unnecessary headers --- .../cuda_hip/preconditioner/batch_block_jacobi.hpp | 6 ------ common/cuda_hip/preconditioner/batch_identity.hpp | 4 ---- .../cuda_hip/preconditioner/batch_scalar_jacobi.hpp | 6 ------ common/cuda_hip/solver/batch_bicgstab_kernels.hpp | 1 - cuda/solver/batch_bicgstab_kernels.cu | 13 ------------- cuda/solver/batch_cg_kernels.cu | 4 ---- hip/solver/batch_bicgstab_kernels.hip.cpp | 11 ----------- hip/solver/batch_cg_kernels.hip.cpp | 12 ------------ 8 files changed, 57 deletions(-) diff --git a/common/cuda_hip/preconditioner/batch_block_jacobi.hpp b/common/cuda_hip/preconditioner/batch_block_jacobi.hpp index c01bafa875a..604989dfa6d 100644 --- a/common/cuda_hip/preconditioner/batch_block_jacobi.hpp +++ b/common/cuda_hip/preconditioner/batch_block_jacobi.hpp @@ -11,17 +11,11 @@ #include #include -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" diff --git a/common/cuda_hip/preconditioner/batch_identity.hpp b/common/cuda_hip/preconditioner/batch_identity.hpp index 3d57bcae406..3fa6693c7ef 100644 --- a/common/cuda_hip/preconditioner/batch_identity.hpp +++ b/common/cuda_hip/preconditioner/batch_identity.hpp @@ -10,12 +10,8 @@ #include #include -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" -#include 
"common/cuda_hip/matrix/batch_csr_kernels.hpp" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" diff --git a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp index 42a4f3f6aa6..5cd8c28a1d0 100644 --- a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp +++ b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp @@ -11,17 +11,11 @@ #include #include -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp index 10d235358bc..8ea31358ed5 100644 --- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp +++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp @@ -13,7 +13,6 @@ #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/thread_ids.hpp" diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index d3dc8712201..8a5eee6b196 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ 
-4,25 +4,12 @@ #include "core/solver/batch_bicgstab_kernels.hpp" -#include -#include - #include #include -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" #include "core/base/batch_struct.hpp" diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 3f7dac1d08a..32e66d7ee54 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -7,13 +7,9 @@ #include #include -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_cg_kernels.hpp" #include "core/base/batch_struct.hpp" diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index d44bc4a0eb6..17199d2cd19 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -4,9 +4,6 @@ #include "core/solver/batch_bicgstab_kernels.hpp" -#include -#include 
- #include #include @@ -15,15 +12,7 @@ #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" #include "core/base/batch_struct.hpp" diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index c9a1e81be81..6d5e3bff3b3 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -4,26 +4,14 @@ #include "core/solver/batch_cg_kernels.hpp" -#include -#include - #include #include -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp" #include "common/cuda_hip/base/batch_struct.hpp" #include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp" #include "common/cuda_hip/matrix/batch_struct.hpp" #include "common/cuda_hip/solver/batch_cg_kernels.hpp" #include "core/base/batch_struct.hpp" From 
eae2cab8686f298e4c1e343aca502103c828fa2c Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 31 Jul 2024 10:10:00 +0200 Subject: [PATCH 172/448] [core] allow naming lambda operations --- include/ginkgo/core/base/executor.hpp | 29 +++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 95373b3e847..8afac213303 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -659,6 +659,17 @@ class Executor : public log::EnableLogging { this->run(op); } + template + void run(std::string name, const ClosureOmp& op_omp, + const ClosureCuda& op_cuda, const ClosureHip& op_hip, + const ClosureDpcpp& op_dpcpp) const + { + LambdaOperation op( + std::move(name), op_omp, op_cuda, op_hip, op_dpcpp); + this->run(op); + } + /** * Allocates memory in this Executor. * @@ -1109,6 +1120,16 @@ class Executor : public log::EnableLogging { typename ClosureDpcpp> class LambdaOperation : public Operation { public: + LambdaOperation(std::string name, const ClosureOmp& op_omp, + const ClosureCuda& op_cuda, const ClosureHip& op_hip, + const ClosureDpcpp& op_dpcpp) + : name_(std::move(name)), + op_omp_(op_omp), + op_cuda_(op_cuda), + op_hip_(op_hip), + op_dpcpp_(op_dpcpp) + {} + /** * Creates an LambdaOperation object from four functors. 
* @@ -1121,10 +1142,7 @@ class Executor : public log::EnableLogging { */ LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda, const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) - : op_omp_(op_omp), - op_cuda_(op_cuda), - op_hip_(op_hip), - op_dpcpp_(op_dpcpp) + : LambdaOperation("unnamed", op_omp, op_cuda, op_hip, op_dpcpp) {} void run(std::shared_ptr) const override @@ -1152,7 +1170,10 @@ class Executor : public log::EnableLogging { op_dpcpp_(); } + const char* get_name() const noexcept override { return name_.c_str(); } + private: + std::string name_; ClosureOmp op_omp_; ClosureCuda op_cuda_; ClosureHip op_hip_; From b9ee8ae4a03b894b3d7503b4d6ec916af02ef109 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 15 Aug 2024 11:42:34 +0200 Subject: [PATCH 173/448] [core] make run(lambda operation) available on all executors --- include/ginkgo/core/base/executor.hpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 8afac213303..0e338f42044 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -1251,8 +1251,6 @@ class ExecutorBase : public Executor { friend class ReferenceExecutor; public: - using Executor::run; - void run(const Operation& op) const override { this->template log(this, &op); @@ -1362,6 +1360,8 @@ class OmpExecutor : public detail::ExecutorBase, friend class detail::ExecutorBase; public: + using Executor::run; + /** * Creates a new OmpExecutor. */ @@ -1439,6 +1439,8 @@ using DefaultExecutor = OmpExecutor; */ class ReferenceExecutor : public OmpExecutor { public: + using Executor::run; + static std::shared_ptr create( std::shared_ptr alloc = std::make_shared()) @@ -1513,6 +1515,8 @@ class CudaExecutor : public detail::ExecutorBase, friend class detail::ExecutorBase; public: + using Executor::run; + /** * Creates a new CudaExecutor. 
* @@ -1748,6 +1752,8 @@ class HipExecutor : public detail::ExecutorBase, friend class detail::ExecutorBase; public: + using Executor::run; + /** * Creates a new HipExecutor. * @@ -1963,6 +1969,8 @@ class DpcppExecutor : public detail::ExecutorBase, friend class detail::ExecutorBase; public: + using Executor::run; + /** * Creates a new DpcppExecutor. * From 2efc482956a4e86fd7aba1cfa08591cd347867fd Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 15 Aug 2024 11:51:08 +0200 Subject: [PATCH 174/448] [core] add tests for lambda op name --- core/test/base/executor.cpp | 38 +++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index 64a11929983..20f795b2ded 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -521,4 +521,42 @@ TEST_F(ExecutorLogging, LogsOperation) } +struct NameLogger : public gko::log::Logger { +protected: + void on_operation_launched(const gko::Executor* exec, + const gko::Operation* op) const override + { + op_name = op->get_name(); + } + +public: + mutable std::string op_name; +}; + + +TEST(LambdaOperation, CanSetName) +{ + auto name_logger = std::make_shared(); + auto exec = gko::ReferenceExecutor::create(); + exec->add_logger(name_logger); + + exec->run( + "name", [] {}, [] {}, [] {}, [] {}); + + ASSERT_EQ("name", name_logger->op_name); +} + + +TEST(LambdaOperation, HasDefaultName) +{ + auto name_logger = std::make_shared(); + auto exec = gko::ReferenceExecutor::create(); + exec->add_logger(name_logger); + + exec->run([] {}, [] {}, [] {}, [] {}); + + ASSERT_EQ("unname", name_logger->op_name); +} + + } // namespace From 900653d8bf8ff3e47de57db39ef1fcc48c03ac8b Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 19 Aug 2024 11:38:19 +0200 Subject: [PATCH 175/448] [core] review updates: - test only for existence of default name - add closure for reference op - deprecate lambda run without name Co-authored-by: Pratik Nayak 
Co-authored-by: Tobias Ribizel --- core/test/base/executor.cpp | 10 ++++- include/ginkgo/core/base/executor.hpp | 55 +++++++++++++++++++-------- test/base/executor.cpp | 22 +++++++++++ 3 files changed, 69 insertions(+), 18 deletions(-) diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index 20f795b2ded..ae037e075df 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -541,12 +541,15 @@ TEST(LambdaOperation, CanSetName) exec->add_logger(name_logger); exec->run( - "name", [] {}, [] {}, [] {}, [] {}); + "name", [] {}, [] {}, [] {}, [] {}, [] {}); ASSERT_EQ("name", name_logger->op_name); } +GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS + + TEST(LambdaOperation, HasDefaultName) { auto name_logger = std::make_shared(); @@ -555,8 +558,11 @@ TEST(LambdaOperation, HasDefaultName) exec->run([] {}, [] {}, [] {}, [] {}); - ASSERT_EQ("unname", name_logger->op_name); + ASSERT_NE(nullptr, name_logger->op_name.c_str()); } +GKO_END_DISABLE_DEPRECATION_WARNINGS + + } // namespace diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 0e338f42044..963e30bfddd 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -651,22 +651,42 @@ class Executor : public log::EnableLogging { */ template + GKO_DEPRECATED( + "Please use the overload with std::string as first parameter.") void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda, const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const { - LambdaOperation op( - op_omp, op_cuda, op_hip, op_dpcpp); + LambdaOperation + op(op_omp, op_cuda, op_hip, op_dpcpp); this->run(op); } - template - void run(std::string name, const ClosureOmp& op_omp, - const ClosureCuda& op_cuda, const ClosureHip& op_hip, - const ClosureDpcpp& op_dpcpp) const + /** + * Runs one of the passed in functors, depending on the Executor type. 
+ * + * @tparam ClosureReference type of op_ref + * @tparam ClosureOmp type of op_omp + * @tparam ClosureCuda type of op_cuda + * @tparam ClosureHip type of op_hip + * @tparam ClosureDpcpp type of op_dpcpp + * + * @param name the name of the operation + * @param op_ref functor to run in case of a ReferenceExecutor + * @param op_omp functor to run in case of a OmpExecutor + * @param op_cuda functor to run in case of a CudaExecutor + * @param op_hip functor to run in case of a HipExecutor + * @param op_dpcpp functor to run in case of a DpcppExecutor + */ + template + void run(std::string name, const ClosureReference& op_ref, + const ClosureOmp& op_omp, const ClosureCuda& op_cuda, + const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const { - LambdaOperation op( - std::move(name), op_omp, op_cuda, op_hip, op_dpcpp); + LambdaOperation + op(std::move(name), op_ref, op_omp, op_cuda, op_hip, op_dpcpp); this->run(op); } @@ -1116,14 +1136,15 @@ class Executor : public log::EnableLogging { * @tparam ClosureHip the type of the third functor * @tparam ClosureDpcpp the type of the fourth functor */ - template + template class LambdaOperation : public Operation { public: - LambdaOperation(std::string name, const ClosureOmp& op_omp, - const ClosureCuda& op_cuda, const ClosureHip& op_hip, - const ClosureDpcpp& op_dpcpp) + LambdaOperation(std::string name, const ClosureReference& op_ref, + const ClosureOmp& op_omp, const ClosureCuda& op_cuda, + const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) : name_(std::move(name)), + op_ref_(op_ref), op_omp_(op_omp), op_cuda_(op_cuda), op_hip_(op_hip), @@ -1142,7 +1163,8 @@ class Executor : public log::EnableLogging { */ LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda, const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) - : LambdaOperation("unnamed", op_omp, op_cuda, op_hip, op_dpcpp) + : LambdaOperation("unnamed", op_omp, op_omp, op_cuda, op_hip, + op_dpcpp) {} void run(std::shared_ptr) const override @@ 
-1152,7 +1174,7 @@ class Executor : public log::EnableLogging { void run(std::shared_ptr) const override { - op_omp_(); + op_ref_(); } void run(std::shared_ptr) const override @@ -1174,6 +1196,7 @@ class Executor : public log::EnableLogging { private: std::string name_; + ClosureReference op_ref_; ClosureOmp op_omp_; ClosureCuda op_cuda_; ClosureHip op_hip_; diff --git a/test/base/executor.cpp b/test/base/executor.cpp index 8a344eb224d..7fcab4e0784 100644 --- a/test/base/executor.cpp +++ b/test/base/executor.cpp @@ -90,9 +90,28 @@ TEST_F(Executor, RunsCorrectHostOperation) } +TEST_F(Executor, RunsCorrectLambdaOperationWithReferenceExecutor) +{ + int value = 0; + auto ref_lambda = [&value]() { value = reference::value; }; + auto omp_lambda = [&value]() { value = omp::value; }; + auto cuda_lambda = [&value]() { value = cuda::value; }; + auto hip_lambda = [&value]() { value = hip::value; }; + auto dpcpp_lambda = [&value]() { value = dpcpp::value; }; + + exec->run("test", ref_lambda, omp_lambda, cuda_lambda, hip_lambda, + dpcpp_lambda); + + ASSERT_EQ(GKO_DEVICE_NAMESPACE::value, value); +} + + #ifndef GKO_COMPILING_REFERENCE +GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS + + TEST_F(Executor, RunsCorrectLambdaOperation) { int value = 0; @@ -107,4 +126,7 @@ TEST_F(Executor, RunsCorrectLambdaOperation) } +GKO_END_DISABLE_DEPRECATION_WARNINGS + + #endif // GKO_COMPILING_REFERENCE From 09d6704ba62b40ec8dbfa8d882eafd6c48b8443a Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Mon, 9 Sep 2024 16:42:57 +0200 Subject: [PATCH 176/448] add the test to detect the problem --- test/solver/bicgstab_kernels.cpp | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/test/solver/bicgstab_kernels.cpp b/test/solver/bicgstab_kernels.cpp index a90451a3f3a..9716acd86cb 100644 --- a/test/solver/bicgstab_kernels.cpp +++ b/test/solver/bicgstab_kernels.cpp @@ -245,6 +245,48 @@ TEST_F(Bicgstab, BicgstabStep3IsEquivalentToRef) } +TEST_F(Bicgstab, BicgstabFinalizeIsEquivalentToRefWithoutRaceCondition) +{ + /** + * This test is designed to detect the following problem. Originally, we + * assigned threads per value to update the value and the stop status if the + * stop status is stopped but not finished yet. However, it leads to race + * conditions. If all threads see stop status before the update, all values + * will be correctly updated. It is also possible that some threads already + * finalize the stop status, but the rest see the stop status as finalized + * such that they will not update the value. We make this test case large to + * trigger this race condition more easily. However, it is not guaranteed to + * fail with the old version because of race conditions. 
+ */ + int m = 1e6; + int n = 2; + x = gen_mtx(m, n, n); + y = gen_mtx(m, n, n); + alpha = gen_mtx(1, n, n); + d_x = x->clone(exec); + d_y = y->clone(exec); + d_alpha = alpha->clone(exec); + stop_status = std::make_unique>(ref, n); + for (size_t i = 0; i < n; ++i) { + stop_status->get_data()[i].reset(); + } + // check correct handling for stopped columns + stop_status->get_data()[1].stop(1); + // finalize only update the stopped one but not finished yet + stop_status->get_data()[0].stop(1, false); + d_stop_status = + std::make_unique>(exec, *stop_status); + + gko::kernels::reference::bicgstab::finalize(ref, x.get(), y.get(), + alpha.get(), stop_status.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::finalize( + exec, d_x.get(), d_y.get(), d_alpha.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, ::r::value); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); +} + + TEST_F(Bicgstab, BicgstabApplyOneRHSIsEquivalentToRef) { int m = 123; From 2c439eecd08d75e27aa5364566e39eb55f03de73 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 5 Sep 2024 18:35:12 +0200 Subject: [PATCH 177/448] fix the race condition --- common/unified/solver/bicgstab_kernels.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/common/unified/solver/bicgstab_kernels.cpp b/common/unified/solver/bicgstab_kernels.cpp index b696815f0d4..c403da3bf96 100644 --- a/common/unified/solver/bicgstab_kernels.cpp +++ b/common/unified/solver/bicgstab_kernels.cpp @@ -174,11 +174,18 @@ void finalize(std::shared_ptr exec, auto stop) { if (stop[col].has_stopped() && !stop[col].is_finalized()) { x(row, col) += alpha[col] * y(row, col); - stop[col].finalize(); } }, x->get_size(), y->get_stride(), x, default_stride(y), row_vector(alpha), *stop_status); + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto stop) { + if (stop[col].has_stopped() && !stop[col].is_finalized()) { + stop[col].finalize(); + } + }, + x->get_size()[1], *stop_status); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL); From 76ce05d6515e545a73993292e12c53795cac97c8 Mon Sep 17 00:00:00 2001 From: nbeams <246972+nbeams@users.noreply.github.com> Date: Wed, 11 Sep 2024 15:22:49 +0000 Subject: [PATCH 178/448] GMRES: fix conj use in MGS dot product --- core/solver/gmres.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index e47714b2186..e066fc696a1 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -164,8 +164,8 @@ void orthogonalize_mgs(matrix::Dense* hessenberg_iter, krylov_bases, dim<2>{num_rows, num_rhs}, span{local_num_rows * i, local_num_rows * (i + 1)}, span{0, num_rhs}); - next_krylov->compute_conj_dot(krylov_basis, hessenberg_entry, - reduction_tmp); + krylov_basis->compute_conj_dot(next_krylov, hessenberg_entry, + reduction_tmp); next_krylov->sub_scaled(hessenberg_entry, krylov_basis); } } From 3e7fc2b60deedbbc5e1c16ae0aded1590f2c0edb Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 2 Oct 2024 15:47:38 +0200 
Subject: [PATCH 179/448] [misc] fix typo --- include/ginkgo/core/base/utils_helper.hpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp index 3ea5c9d878d..1cd36cdadcb 100644 --- a/include/ginkgo/core/base/utils_helper.hpp +++ b/include/ginkgo/core/base/utils_helper.hpp @@ -95,32 +95,32 @@ using pointee = template -struct is_clonable_impl : std::false_type {}; +struct is_cloneable_impl : std::false_type {}; template -struct is_clonable_impl().clone())>> +struct is_cloneable_impl().clone())>> : std::true_type {}; template -constexpr bool is_clonable() +constexpr bool is_cloneable() { - return is_clonable_impl>::value; + return is_cloneable_impl>::value; } template -struct is_clonable_to_impl : std::false_type {}; +struct is_cloneable_to_impl : std::false_type {}; template -struct is_clonable_to_impl< +struct is_cloneable_to_impl< T, std::void_t().clone( std::declval>()))>> : std::true_type {}; template -constexpr bool is_clonable_to() +constexpr bool is_cloneable_to() { - return is_clonable_to_impl>::value; + return is_cloneable_to_impl>::value; } @@ -172,7 +172,7 @@ using shared_type = std::shared_ptr>; template inline detail::cloned_type clone(const Pointer& p) { - static_assert(detail::is_clonable>(), + static_assert(detail::is_cloneable>(), "Object is not clonable"); return detail::cloned_type( static_cast>::type*>( @@ -199,7 +199,7 @@ template inline detail::cloned_type clone(std::shared_ptr exec, const Pointer& p) { - static_assert(detail::is_clonable_to>(), + static_assert(detail::is_cloneable_to>(), "Object is not clonable"); return detail::cloned_type( static_cast>::type*>( From a22ccbea83915096ca7759d461f6369430c6630b Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 2 Oct 2024 15:57:03 +0200 Subject: [PATCH 180/448] [misc] fix typo --- core/test/base/utils.cpp | 32 +++++++++++------------ 
include/ginkgo/core/base/utils_helper.hpp | 4 +-- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/core/test/base/utils.cpp b/core/test/base/utils.cpp index 1ad4705b824..5c6947b7cc6 100644 --- a/core/test/base/utils.cpp +++ b/core/test/base/utils.cpp @@ -83,19 +83,19 @@ TEST(PointerParam, WorksForUniquePointers) } -struct ClonableDerived : Base { - ClonableDerived(std::shared_ptr exec = nullptr) +struct CloneableDerived : Base { + CloneableDerived(std::shared_ptr exec = nullptr) : executor(exec) {} std::unique_ptr clone() { - return std::unique_ptr(new ClonableDerived()); + return std::unique_ptr(new CloneableDerived()); } std::unique_ptr clone(std::shared_ptr exec) { - return std::unique_ptr(new ClonableDerived{exec}); + return std::unique_ptr(new CloneableDerived{exec}); } std::shared_ptr executor; @@ -104,36 +104,36 @@ struct ClonableDerived : Base { TEST(Clone, ClonesUniquePointer) { - std::unique_ptr p(new ClonableDerived()); + std::unique_ptr p(new CloneableDerived()); auto clone = gko::clone(p); ::testing::StaticAssertTypeEq>(); + std::unique_ptr>(); ASSERT_NE(p.get(), clone.get()); } TEST(Clone, ClonesSharedPointer) { - std::shared_ptr p(new ClonableDerived()); + std::shared_ptr p(new CloneableDerived()); auto clone = gko::clone(p); ::testing::StaticAssertTypeEq>(); + std::unique_ptr>(); ASSERT_NE(p.get(), clone.get()); } TEST(Clone, ClonesPlainPointer) { - std::unique_ptr p(new ClonableDerived()); + std::unique_ptr p(new CloneableDerived()); auto clone = gko::clone(p.get()); ::testing::StaticAssertTypeEq>(); + std::unique_ptr>(); ASSERT_NE(p.get(), clone.get()); } @@ -141,12 +141,12 @@ TEST(Clone, ClonesPlainPointer) TEST(CloneTo, ClonesUniquePointer) { auto exec = gko::ReferenceExecutor::create(); - std::unique_ptr p(new ClonableDerived()); + std::unique_ptr p(new CloneableDerived()); auto clone = gko::clone(exec, p); ::testing::StaticAssertTypeEq>(); + std::unique_ptr>(); ASSERT_NE(p.get(), clone.get()); ASSERT_EQ(clone->executor, exec); } 
@@ -155,12 +155,12 @@ TEST(CloneTo, ClonesUniquePointer) TEST(CloneTo, ClonesSharedPointer) { auto exec = gko::ReferenceExecutor::create(); - std::shared_ptr p(new ClonableDerived()); + std::shared_ptr p(new CloneableDerived()); auto clone = gko::clone(exec, p); ::testing::StaticAssertTypeEq>(); + std::unique_ptr>(); ASSERT_NE(p.get(), clone.get()); ASSERT_EQ(clone->executor, exec); } @@ -169,12 +169,12 @@ TEST(CloneTo, ClonesSharedPointer) TEST(CloneTo, ClonesPlainPointer) { auto exec = gko::ReferenceExecutor::create(); - std::unique_ptr p(new ClonableDerived()); + std::unique_ptr p(new CloneableDerived()); auto clone = gko::clone(exec, p.get()); ::testing::StaticAssertTypeEq>(); + std::unique_ptr>(); ASSERT_NE(p.get(), clone.get()); ASSERT_EQ(clone->executor, exec); } diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp index 1cd36cdadcb..951ea4bbf5d 100644 --- a/include/ginkgo/core/base/utils_helper.hpp +++ b/include/ginkgo/core/base/utils_helper.hpp @@ -173,7 +173,7 @@ template inline detail::cloned_type clone(const Pointer& p) { static_assert(detail::is_cloneable>(), - "Object is not clonable"); + "Object is not cloneable"); return detail::cloned_type( static_cast>::type*>( p->clone().release())); @@ -200,7 +200,7 @@ inline detail::cloned_type clone(std::shared_ptr exec, const Pointer& p) { static_assert(detail::is_cloneable_to>(), - "Object is not clonable"); + "Object is not cloneable"); return detail::cloned_type( static_cast>::type*>( p->clone(std::move(exec)).release())); From fd24e5d0d25a90f8d4eed21ce67d7ec6dcf62ff1 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Tue, 8 Oct 2024 15:40:34 +0200 Subject: [PATCH 181/448] add workspace for reduction usage --- core/stop/residual_norm.cpp | 18 ++++++++++++------ include/ginkgo/core/stop/residual_norm.hpp | 2 ++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp index adf7da3e2e6..4e73cc8d56a 100644 --- a/core/stop/residual_norm.cpp +++ b/core/stop/residual_norm.cpp @@ -98,7 +98,8 @@ ResidualNormBase::ResidualNormBase( system_matrix_{args.system_matrix}, b_{args.b}, one_{gko::initialize({1}, exec)}, - neg_one_{gko::initialize({-1}, exec)} + neg_one_{gko::initialize({-1}, exec)}, + reduction_tmp_{exec} { switch (baseline_) { case mode::initial_resnorm: { @@ -113,7 +114,8 @@ ResidualNormBase::ResidualNormBase( args.system_matrix->apply(neg_one_, args.x, one_, b_clone); norm_dispatch( [&](auto dense_r) { - dense_r->compute_norm2(this->starting_tau_); + dense_r->compute_norm2(this->starting_tau_, + reduction_tmp_); }, b_clone.get()); } @@ -122,7 +124,7 @@ ResidualNormBase::ResidualNormBase( exec, dim<2>{1, args.initial_residual->get_size()[1]}); norm_dispatch( [&](auto dense_r) { - dense_r->compute_norm2(this->starting_tau_); + dense_r->compute_norm2(this->starting_tau_, reduction_tmp_); }, args.initial_residual); } @@ -135,7 +137,9 @@ ResidualNormBase::ResidualNormBase( this->starting_tau_ = NormVector::create(exec, dim<2>{1, args.b->get_size()[1]}); norm_dispatch( - [&](auto dense_r) { dense_r->compute_norm2(this->starting_tau_); }, + [&](auto dense_r) { + dense_r->compute_norm2(this->starting_tau_, reduction_tmp_); + }, args.b.get()); break; } @@ -169,7 +173,9 @@ bool ResidualNormBase::check_impl( return false; } else if (updater.residual_ != nullptr) { norm_dispatch( - [&](auto dense_r) { dense_r->compute_norm2(u_dense_tau_); }, + [&](auto dense_r) { + dense_r->compute_norm2(u_dense_tau_, reduction_tmp_); + }, updater.residual_); dense_tau = u_dense_tau_.get(); } else if (updater.solution_ != nullptr 
&& system_matrix_ != nullptr && @@ -179,7 +185,7 @@ bool ResidualNormBase::check_impl( [&](auto dense_b, auto dense_x) { auto dense_r = dense_b->clone(); system_matrix_->apply(neg_one_, dense_x, one_, dense_r); - dense_r->compute_norm2(u_dense_tau_); + dense_r->compute_norm2(u_dense_tau_, reduction_tmp_); }, b_.get(), updater.solution_); dense_tau = u_dense_tau_.get(); diff --git a/include/ginkgo/core/stop/residual_norm.hpp b/include/ginkgo/core/stop/residual_norm.hpp index 6ee3c843e6a..7ee020207d4 100644 --- a/include/ginkgo/core/stop/residual_norm.hpp +++ b/include/ginkgo/core/stop/residual_norm.hpp @@ -82,6 +82,8 @@ class ResidualNormBase /* one/neg_one for residual computation */ std::shared_ptr one_{}; std::shared_ptr neg_one_{}; + // workspace for reduction + mutable gko::array reduction_tmp_; }; From 244b5a956a74e7ffc9fccc90ca68be7de1fa0765 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 2 Oct 2024 15:31:04 +0200 Subject: [PATCH 182/448] [ci] disable horeka CI jobs --- .gitlab-ci.yml | 91 +++-------------------------------------------- .gitlab/image.yml | 6 ---- .gitlab/rules.yml | 5 +++ 3 files changed, 9 insertions(+), 93 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 226a10f4cea..d6ba260f75d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -91,91 +91,6 @@ trigger_pipeline: fi -# Build jobs -# Job with example runs. 
-# cuda 11.0 and friends on HoreKa with tests -build/cuda110/mvapich2/gcc/cuda/debug/shared: - extends: - - .build_template - - .default_variables - - .quick_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 - variables: - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_MPI: "ON" - BUILD_TYPE: "Debug" - FAST_TESTS: "ON" - NONDEFAULT_STREAM: "ON" - CUDA_ARCH: 80 - USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}" - KEEP_CONTAINER: "ON" - USE_SLURM: 0 - -test/cuda110/mvapich2/gcc/cuda/debug/shared: - extends: - - .horeka_test_template - - .default_variables - - .quick_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 - variables: - USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}" - SLURM_PARTITION: "accelerated" - SLURM_GRES: "gpu:4" - SLURM_TIME: "02:00:00" - dependencies: null - needs: [ "build/cuda110/mvapich2/gcc/cuda/debug/shared" ] - - -build/cuda110/nompi/clang/cuda/release/static: - extends: - - .build_template - - .default_variables - - .full_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 - variables: - CXX_COMPILER: "clang++" - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_TYPE: "Release" - BUILD_SHARED_LIBS: "OFF" - CUDA_ARCH: 80 - USE_NAME: "cuda110-nompi-clang-${CI_PIPELINE_ID}" - KEEP_CONTAINER: "ON" - USE_SLURM: 0 - -test/cuda110/nompi/clang/cuda/release/static: - extends: - - .horeka_test_template - - .default_variables - - .full_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 - variables: - USE_NAME: "cuda110-nompi-clang-${CI_PIPELINE_ID}" - SLURM_PARTITION: "accelerated" - SLURM_GRES: "gpu:4" - SLURM_TIME: "01:30:00" - dependencies: null - needs: [ "build/cuda110/nompi/clang/cuda/release/static" ] - - -build/cuda110/nompi/clang/cuda/release/shared: - extends: - - .build_template - - .default_variables - - .quick_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 - variables: - CXX_COMPILER: "clang++" - CUDA_ARCH: 52 - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_TYPE: 
"Release" - FAST_TESTS: "ON" - # disable spurious unused argument warning - EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" - - # cuda 11.4 and friends build/cuda114/nompi/gcc/cuda/debug/shared: extends: @@ -764,8 +679,9 @@ benchmark-cuda-spmv-build: extends: - .build_template - .default_variables - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 + - .use_gko_cuda114-openmpi-gnu10-llvm12 - .benchmark-spmv-cuda-rules + - .disable_job_condition stage: benchmark-build variables: BUILD_OMP: "ON" @@ -785,8 +701,9 @@ benchmark-cuda-spmv: extends: - .benchmark_template - .default_variables - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 + - .use_gko_cuda114-openmpi-gnu10-llvm12 - .benchmark-spmv-cuda-rules + - .disable_job_condition stage: benchmark-cuda variables: BENCHMARK_REPO: git@github.com:ginkgo-project/ginkgo-data.git diff --git a/.gitlab/image.yml b/.gitlab/image.yml index 60521044d7f..2295f6312ae 100644 --- a/.gitlab/image.yml +++ b/.gitlab/image.yml @@ -17,12 +17,6 @@ - cpu - amdci -.use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020: - image: ginkgohub/cuda:110-mvapich2-gnu9-llvm9-intel2020 - tags: - - private_ci - - horeka - .use_gko_cuda114-openmpi-gnu10-llvm12: image: ginkgohub/cuda:114-openmpi-gnu10-llvm12 tags: diff --git a/.gitlab/rules.yml b/.gitlab/rules.yml index 0280017c08b..4afc04799bb 100644 --- a/.gitlab/rules.yml +++ b/.gitlab/rules.yml @@ -59,3 +59,8 @@ # - common/unified/matrix/* # for now no SpMV there? 
when: manual allow_failure: true + + +.disable_job_condition: + rules: + - when: never From 6b96a374328dbc763568210020be1852ce575961 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 2 Oct 2024 15:37:37 +0200 Subject: [PATCH 183/448] [ci] delete github windows jobs --- .github/workflows/windows-mingw.yml | 65 ------------------------- .github/workflows/windows-msvc-cuda.yml | 62 ----------------------- .github/workflows/windows-msvc-ref.yml | 62 ----------------------- 3 files changed, 189 deletions(-) delete mode 100644 .github/workflows/windows-mingw.yml delete mode 100644 .github/workflows/windows-msvc-cuda.yml delete mode 100644 .github/workflows/windows-msvc-ref.yml diff --git a/.github/workflows/windows-mingw.yml b/.github/workflows/windows-mingw.yml deleted file mode 100644 index 1c859661562..00000000000 --- a/.github/workflows/windows-mingw.yml +++ /dev/null @@ -1,65 +0,0 @@ -name: Windows-MinGW - -on: - push: - branches: - - 'master' - - 'develop' - - 'release/**' - tags: - - '**' - pull_request: - types: [opened,synchronize] - workflow_dispatch: - inputs: - debug_enabled: - description: 'Run the build with tmate debugging enabled by `debug_enabled` keyword (https://github.com/marketplace/actions/debugging-with-tmate)' - required: false - default: false - -concurrency: - group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }} - cancel-in-progress: true - -jobs: - windows_mingw: - if: ${{ false }} - strategy: - fail-fast: false - matrix: - config: - - {shared: "OFF", build_type: "Release", name: "omp/release/static", cflags: ""} - name: mingw/${{ matrix.config.name }} - runs-on: [windows-latest] - steps: - - name: Checkout the latest code (shallow clone) - uses: actions/checkout@v4 - - - name: Debug over SSH (tmate) - uses: mxschmitt/action-tmate@v3.5 - if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }} - with: - limit-access-to-actor: true - - - name: configure - # Use cmd to remove 
the path easily - run: | - bcdedit /set IncreaseUserVa 3072 - editbin /LARGEADDRESSAWARE "C:\Program Files\Git\mingw64\bin\cc1plus.exe" - set PATH=C:\Program Files\Git\mingw64\bin;%PATH% - set PATH=C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin;%PATH% - mkdir build - cd build - cmake -G "MinGW Makefiles" -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DCMAKE_CXX_FLAGS=${{ matrix.config.cflags }} .. - cmake --build . -j4 - shell: cmd - - - name: install - run: | - set PATH=C:\Program Files\Git\mingw64\bin;%PATH% - set PATH=C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin;%PATH% - set PATH=C:\Program Files (x86)\Ginkgo\bin;%PATH% - cd build - cmake --install . - cmake --build . --target test_install - shell: cmd diff --git a/.github/workflows/windows-msvc-cuda.yml b/.github/workflows/windows-msvc-cuda.yml deleted file mode 100644 index efa637b2bf9..00000000000 --- a/.github/workflows/windows-msvc-cuda.yml +++ /dev/null @@ -1,62 +0,0 @@ -name: Windows-MSVC-CUDA (compile-only) - -on: - push: - branches: - - 'master' - - 'develop' - - 'release/**' - tags: - - '**' - pull_request: - types: [opened,synchronize] - workflow_dispatch: - inputs: - debug_enabled: - description: 'Run the build with tmate debugging enabled by `debug_enabled` keyword (https://github.com/marketplace/actions/debugging-with-tmate)' - required: false - default: false - -concurrency: - group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }} - cancel-in-progress: true - -jobs: - windows_cuda: - if: ${{ false }} - strategy: - fail-fast: false - matrix: - config: - - {version: "latest", name: "cuda-latest/release/shared", "mixed": "ON"} - name: msvc/${{ matrix.config.name }} (only compile) - runs-on: [windows-2019] - - steps: - - name: Checkout the latest code (shallow clone) - uses: actions/checkout@v4 - - name: setup (versioned) - if: matrix.config.version != 'latest' - run: | - choco install 
cuda --version=${{ matrix.config.version }} -y - - - name: setup (latest) - if: matrix.config.version == 'latest' - run: | - choco install cuda -y - - - name: Debug over SSH (tmate) - uses: mxschmitt/action-tmate@v3.5 - if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }} - with: - limit-access-to-actor: true - - - name: configure - run: | - $env:ChocolateyInstall = Convert-Path "$((Get-Command choco).Path)\..\.." - Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" - refreshenv - mkdir build - cd build - cmake -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_CUDA_ARCHITECTURES=60 .. - cmake --build . -j4 --config Release diff --git a/.github/workflows/windows-msvc-ref.yml b/.github/workflows/windows-msvc-ref.yml deleted file mode 100644 index 60a811bb99b..00000000000 --- a/.github/workflows/windows-msvc-ref.yml +++ /dev/null @@ -1,62 +0,0 @@ -name: Windows-MSVC-Reference - -on: - push: - branches: - - 'master' - - 'develop' - - 'release/**' - tags: - - '**' - pull_request: - types: [opened,synchronize] - workflow_dispatch: - inputs: - debug_enabled: - description: 'Run the build with tmate debugging enabled by `debug_enabled` keyword (https://github.com/marketplace/actions/debugging-with-tmate)' - required: false - default: false - -concurrency: - group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }} - cancel-in-progress: true - -jobs: - windows_ref: - if: ${{ false }} - strategy: - fail-fast: false - matrix: - config: - # Debug shared exceeds symbol limit - # - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"} - - {shared: "OFF", build_type: "Release", name: "reference/release/static"} - - {shared: "ON", build_type: "Release", name: "reference/release/shared"} - # Debug static needs too much storage - # - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"} - name: msvc/${{ 
matrix.config.name }} - runs-on: [windows-latest] - steps: - - name: Checkout the latest code (shallow clone) - uses: actions/checkout@v4 - - - name: Debug over SSH (tmate) - uses: mxschmitt/action-tmate@v3.5 - if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }} - with: - limit-access-to-actor: true - - - name: configure - run: | - mkdir build - cd build - cmake -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG='/MDd /Zi /Ob1 /O1 /Od /RTC1' -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF .. - cmake --build . -j4 --config ${{ matrix.config.build_type }} - ctest . -C ${{ matrix.config.build_type }} --output-on-failure - - - name: install - run: | - $env:PATH="$env:PATH;C:\Program Files (x86)\Ginkgo\bin" - cd build - cmake --install . --config ${{ matrix.config.build_type }} - cmake --build . --target test_install --config ${{ matrix.config.build_type }} From 1deb9a29e17e45671c47b756f06beb0bbffc505d Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 7 Oct 2024 11:37:54 +0000 Subject: [PATCH 184/448] [ci] use nla-gpu for QoS jobs --- .gitlab-ci.yml | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d6ba260f75d..ef10b92e20d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -433,10 +433,9 @@ warnings: - .build_template - .default_variables - .full_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 + - .use_gko-rocm514-nompi-gnu11-llvm11 variables: BUILD_OMP: "ON" - BUILD_CUDA: "ON" CXX_FLAGS: "-Werror=pedantic -pedantic-errors" allow_failure: yes @@ -447,10 +446,9 @@ no-circular-deps: - .build_template - .default_variables - .quick_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 + - .use_gko-rocm514-nompi-gnu11-llvm11 variables: BUILD_OMP: "ON" - BUILD_CUDA: "ON" EXTRA_CMAKE_FLAGS: '-DGINKGO_CHECK_CIRCULAR_DEPS=on' allow_failure: no @@ -474,10 +472,9 @@ clang-tidy: - .build_template - 
.default_variables - .full_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 + - .use_gko-rocm514-nompi-gnu11-llvm11 variables: BUILD_OMP: "ON" - BUILD_CUDA: "ON" EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_CLANG_TIDY=ON' allow_failure: yes @@ -487,10 +484,9 @@ iwyu: - .build_template - .default_variables - .full_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 + - .use_gko-rocm514-nompi-gnu11-llvm11 variables: BUILD_OMP: "ON" - BUILD_CUDA: "ON" EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_IWYU=ON' allow_failure: yes @@ -502,7 +498,7 @@ sonarqube_cov_: - .default_variables - .quick_test_short_lived_condition - .before_script_template - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 + - .use_gko_cuda114-openmpi-gnu10-llvm12 tags: - private_ci - controller @@ -538,7 +534,7 @@ sonarqube_cov: - .default_variables - .deploy_condition - .before_script_template - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 + - .use_gko_cuda114-openmpi-gnu10-llvm12 tags: - private_ci - controller @@ -601,7 +597,7 @@ threadsanitizer: - .default_variables - .deploy_condition - .before_script_template - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 + - .use_gko-rocm514-nompi-gnu11-llvm11 script: - LD_PRELOAD=/usr/local/lib/libomp.so CC=clang CXX=clang++ @@ -616,7 +612,7 @@ leaksanitizer: - .default_variables - .deploy_condition - .before_script_template - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 + - .use_gko-rocm514-nompi-gnu11-llvm11 script: - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=LSAN -DCTEST_MEMORYCHECK_TYPE=LeakSanitizer @@ -627,7 +623,7 @@ addresssanitizer: - .default_variables - .deploy_condition - .before_script_template - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 + - .use_gko-rocm514-nompi-gnu11-llvm11 script: - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=ASAN -DCTEST_MEMORYCHECK_TYPE=AddressSanitizer @@ -638,7 +634,7 @@ undefinedsanitizer: - .default_variables - .deploy_condition - 
.before_script_template - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 + - .use_gko-rocm514-nompi-gnu11-llvm11 script: # the Gold linker is required because of a linker flag issues given by UBsan # in the Ubuntu setup we are using. From 41b535a2c8cdea24ed2336db17a2f540bdf67a87 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 9 Oct 2024 07:33:40 +0000 Subject: [PATCH 185/448] [ci] fix circular-deps issues --- benchmark/CMakeLists.txt | 12 ++++++------ reference/solver/batch_bicgstab_kernels.hpp | 2 ++ reference/solver/batch_cg_kernels.hpp | 2 ++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index e2479e02344..55ed76d1613 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -21,7 +21,7 @@ function(ginkgo_benchmark_cusparse_linops type def) # make the dependency public to catch issues target_compile_definitions(cusparse_linops_${type} PUBLIC ${def}) target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA) - target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse) + target_link_libraries(cusparse_linops_${type} PRIVATE Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse) ginkgo_compile_features(cusparse_linops_${type}) endfunction() @@ -31,7 +31,7 @@ function(ginkgo_benchmark_hipsparse_linops type def) target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def}) target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP) target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS}) - target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES}) + target_link_libraries(hipsparse_linops_${type} PRIVATE Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES}) ginkgo_compile_features(hipsparse_linops_${type}) endfunction() @@ -118,7 +118,7 @@ if (GINKGO_BUILD_CUDA) ginkgo_benchmark_cusparse_linops(z 
GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) add_library(cuda_timer utils/cuda_timer.cpp) - target_link_libraries(cuda_timer ginkgo CUDA::cudart) + target_link_libraries(cuda_timer PRIVATE ginkgo CUDA::cudart) ginkgo_compile_features(cuda_timer) endif() if (GINKGO_BUILD_HIP) @@ -128,7 +128,7 @@ if (GINKGO_BUILD_HIP) ginkgo_benchmark_hipsparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) set_source_files_properties(utils/hip_timer.hip.cpp PROPERTIES LANGUAGE HIP) add_library(hip_timer utils/hip_timer.hip.cpp) - target_link_libraries(hip_timer ginkgo) + target_link_libraries(hip_timer PRIVATE ginkgo) ginkgo_compile_features(hip_timer) endif() @@ -140,13 +140,13 @@ if (GINKGO_BUILD_SYCL) add_library(dpcpp_timer utils/dpcpp_timer.dp.cpp) target_compile_options(dpcpp_timer PRIVATE ${GINKGO_DPCPP_FLAGS}) gko_add_sycl_to_target(TARGET dpcpp_timer SOURCES utils/dpcpp_timer.dp.cpp) - target_link_libraries(dpcpp_timer ginkgo) + target_link_libraries(dpcpp_timer PRIVATE ginkgo) ginkgo_compile_features(dpcpp_timer) endif() if (GINKGO_BUILD_MPI) add_library(mpi_timer ${Ginkgo_SOURCE_DIR}/benchmark/utils/mpi_timer.cpp) - target_link_libraries(mpi_timer ginkgo) + target_link_libraries(mpi_timer PRIVATE ginkgo) ginkgo_compile_features(mpi_timer) endif() diff --git a/reference/solver/batch_bicgstab_kernels.hpp b/reference/solver/batch_bicgstab_kernels.hpp index f91e06d2e44..85b1bed5ccd 100644 --- a/reference/solver/batch_bicgstab_kernels.hpp +++ b/reference/solver/batch_bicgstab_kernels.hpp @@ -6,6 +6,8 @@ #define GKO_REFERENCE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ +#include "core/solver/batch_bicgstab_kernels.hpp" + #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_multi_vector_kernels.hpp" diff --git a/reference/solver/batch_cg_kernels.hpp b/reference/solver/batch_cg_kernels.hpp index d4a35e3d01a..2f8e5990931 100644 --- 
a/reference/solver/batch_cg_kernels.hpp +++ b/reference/solver/batch_cg_kernels.hpp @@ -6,6 +6,8 @@ #define GKO_REFERENCE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ +#include "core/solver/batch_cg_kernels.hpp" + #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "reference/base/batch_multi_vector_kernels.hpp" From 9d0f7cf397e566e9f43d86d267cfc19949854edd Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 11 Oct 2024 10:00:02 +0200 Subject: [PATCH 186/448] [doc] remove windows ci badge AFAIK there is no way to get a gitlab badge for a specific job, so there is no replacement. --- README.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 74fd6a0f57e..598b17e5b5b 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,8 @@ |:-:|:-:|:-:|:-:| -[![Build status](https://gitlab.com/ginkgo-project/ginkgo-public-ci/badges/master/pipeline.svg)](https://gitlab.com/ginkgo-project/ginkgo-public-ci/-/pipelines?page=1&scope=branches&ref=master)|[![OSX-build](https://github.com/ginkgo-project/ginkgo/actions/workflows/osx.yml/badge.svg)](https://github.com/ginkgo-project/ginkgo/actions/workflows/osx.yml)|[![Windows-build](https://github.com/ginkgo-project/ginkgo/actions/workflows/windows-msvc-ref.yml/badge.svg)](https://github.com/ginkgo-project/ginkgo/actions/workflows/windows-msvc-ref.yml) -|:-:|:-:|:-:| - - -[![codecov](https://codecov.io/gh/ginkgo-project/ginkgo/branch/master/graph/badge.svg)](https://codecov.io/gh/ginkgo-project/ginkgo)|[![Maintainability Rating](https://sonarcloud.io/api/project_badges/measure?project=ginkgo-project_ginkgo&metric=sqale_rating)](https://sonarcloud.io/dashboard?id=ginkgo-project_ginkgo)|[![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=ginkgo-project_ginkgo&metric=reliability_rating)](https://sonarcloud.io/dashboard?id=ginkgo-project_ginkgo)|[![CDash 
dashboard](https://img.shields.io/badge/CDash-Access-blue.svg)](https://my.cdash.org/index.php?project=Ginkgo+Project) -|:-:|:-:|:-:|:-:| +[![Build status](https://gitlab.com/ginkgo-project/ginkgo-public-ci/badges/master/pipeline.svg)](https://gitlab.com/ginkgo-project/ginkgo-public-ci/-/pipelines?page=1&scope=branches&ref=master)|[![OSX-build](https://github.com/ginkgo-project/ginkgo/actions/workflows/osx.yml/badge.svg)](https://github.com/ginkgo-project/ginkgo/actions/workflows/osx.yml)|[![codecov](https://codecov.io/gh/ginkgo-project/ginkgo/branch/master/graph/badge.svg)](https://codecov.io/gh/ginkgo-project/ginkgo)|[![Maintainability Rating](https://sonarcloud.io/api/project_badges/measure?project=ginkgo-project_ginkgo&metric=sqale_rating)](https://sonarcloud.io/dashboard?id=ginkgo-project_ginkgo)|[![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=ginkgo-project_ginkgo&metric=reliability_rating)](https://sonarcloud.io/dashboard?id=ginkgo-project_ginkgo)|[![CDash dashboard](https://img.shields.io/badge/CDash-Access-blue.svg)](https://my.cdash.org/index.php?project=Ginkgo+Project) +|:-:|:-:|:-:|:-:|:-:|:-:| From 9566b2232cdf11acdbaafeb3c9dd1da0ff870929 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 11 Oct 2024 11:04:43 +0200 Subject: [PATCH 187/448] [ci] fix cuda memcheck --- cmake/CTestScript.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/CTestScript.cmake b/cmake/CTestScript.cmake index 81ff86625d1..c24e4e4529a 100644 --- a/cmake/CTestScript.cmake +++ b/cmake/CTestScript.cmake @@ -153,11 +153,11 @@ ctest_submit(PARTS Start) if (CTEST_MEMORYCHECK_TYPE STREQUAL "CudaMemcheck") # generate line number information for CUDA - set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=OFF;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION};-DCMAKE_CUDA_FLAGS=-lineinfo") + set(GINKGO_CONFIGURE_OPTIONS 
"-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=OFF;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=OFF;-DGINKGO_BUILD_SYCL=OFF;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION};-DCMAKE_CUDA_FLAGS=-lineinfo") elseif((NOT CTEST_MEMORYCHECK_TYPE STREQUAL "NONE" AND NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind") OR CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE") - set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=OFF;-DGINKGO_BUILD_HIP=OFF;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}") + set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=OFF;-DGINKGO_BUILD_HIP=OFF;-DGINKGO_BUILD_SYCL=OFF;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}") else() - set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}") + set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=ON;-DGINKGO_BUILD_SYCL=OFF;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}") endif() # UBSAN needs gold linker From b2e39cccc17c26a0c7ba885bbb25ecd6d1e7031a Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 14 Oct 2024 15:07:29 +0200 Subject: [PATCH 188/448] [ci] move the majority of jobs to the full pipeline --- .gitlab-ci.yml | 28 ++++++++++++++-------------- .gitlab/rules.yml | 12 +++++++----- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ef10b92e20d..18771d9bc2d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -92,7 +92,7 @@ trigger_pipeline: # cuda 11.4 and friends -build/cuda114/nompi/gcc/cuda/debug/shared: +build/cuda114/nompi/gcc/cuda/release/shared: extends: - .build_and_test_template - .default_variables @@ -101,7 +101,7 @@ 
build/cuda114/nompi/gcc/cuda/debug/shared: variables: BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: "Debug" + BUILD_TYPE: "Release" FAST_TESTS: "ON" # fix gtest issue https://github.com/google/googletest/issues/3514 CXX_FLAGS: "-Wno-error=maybe-uninitialized" @@ -114,7 +114,7 @@ build/nvhpc233/cuda120/nompi/nvcpp/release/static: extends: - .build_and_test_template - .default_variables - - .quick_test_condition + - .full_test_condition - .use_gko_nvhpc233-cuda120-openmpi-gnu12-llvm16 variables: CXX_COMPILER: "nvc++" @@ -133,7 +133,7 @@ build/nvhpc227/cuda117/nompi/nvcpp/debug/shared: extends: - .build_and_test_template - .default_variables - - .quick_test_condition + - .full_test_condition - .use_gko_nvhpc227-cuda117-openmpi-gnu11-llvm14 variables: CXX_COMPILER: "nvc++" @@ -178,7 +178,7 @@ build/amd/nompi/clang/rocm45/debug/shared: extends: - .build_and_test_template - .default_variables - - .quick_test_condition + - .full_test_condition - .use_gko-rocm45-nompi-gnu8-llvm8 variables: CXX_COMPILER: "clang++" @@ -203,7 +203,7 @@ build/amd/nompi/clang/rocm514/release/shared: extends: - .build_and_test_template - .default_variables - - .quick_test_condition + - .full_test_condition - .use_gko-rocm514-nompi-gnu11-llvm11 variables: CXX_COMPILER: "clang++" @@ -229,7 +229,7 @@ build/nocuda/nompi/gcc/core/debug/static: extends: - .build_and_test_template - .default_variables - - .quick_test_condition + - .full_test_condition - .use_gko-nocuda-nompi-gnu9-llvm8 variables: BUILD_TYPE: "Debug" @@ -241,7 +241,7 @@ build/nocuda/nompi/clang/core/release/shared: extends: - .build_and_test_template - .default_variables - - .quick_test_condition + - .full_test_condition - .use_gko-nocuda-nompi-gnu9-llvm8 variables: CXX_COMPILER: "clang++" @@ -276,7 +276,7 @@ build/nocuda/openmpi/clang/omp/glibcxx-debug-release/shared: extends: - .build_and_test_template - .default_variables - - .quick_test_condition + - .full_test_condition - .use_gko-nocuda-nompi-gnu9-llvm8 variables: CXX_COMPILER: 
"clang++" @@ -292,7 +292,7 @@ build/nocuda/nompi/gcc/omp/release/static: extends: - .build_and_test_template - .default_variables - - .quick_test_condition + - .full_test_condition - .use_gko-nocuda-nompi-gnu9-llvm8 variables: BUILD_OMP: "ON" @@ -316,7 +316,7 @@ build/nocuda-nomixed/openmpi/gcc/omp/release/shared: extends: - .build_and_test_template - .default_variables - - .quick_test_condition + - .full_test_condition - .use_gko-nocuda-nompi-gnu9-llvm8 variables: BUILD_MPI: "ON" @@ -410,7 +410,7 @@ build/windows-cuda/release/shared: build/windows/release/shared: extends: - - .quick_test_condition + - .full_test_condition stage: build script: - if (Test-Path build) { rm -r -fo build } @@ -445,7 +445,7 @@ no-circular-deps: extends: - .build_template - .default_variables - - .quick_test_condition + - .full_test_condition - .use_gko-rocm514-nompi-gnu11-llvm11 variables: BUILD_OMP: "ON" @@ -496,7 +496,7 @@ sonarqube_cov_: stage: code_quality extends: - .default_variables - - .quick_test_short_lived_condition + - .full_test_short_lived_condition - .before_script_template - .use_gko_cuda114-openmpi-gnu10-llvm12 tags: diff --git a/.gitlab/rules.yml b/.gitlab/rules.yml index 4afc04799bb..e60aaf7a66c 100644 --- a/.gitlab/rules.yml +++ b/.gitlab/rules.yml @@ -30,18 +30,20 @@ dependencies: [] -.quick_test_condition: +.full_test_short_lived_condition: rules: - - if: $RUN_CI_TAG && $STATUS_CONTEXT == null + - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "master" || $CI_COMMIT_TAG + when: never + - if: $RUN_CI_TAG && $STATUS_CONTEXT == "full" dependencies: [] -.quick_test_short_lived_condition: + +.quick_test_condition: rules: - - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "master" || $CI_COMMIT_TAG - when: never - if: $RUN_CI_TAG && $STATUS_CONTEXT == null dependencies: [] + .deploy_condition: rules: - if: $RUN_CI_TAG && ($CI_COMMIT_BRANCH == "master" || $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_TAG) && $CI_PIPELINE_SOURCE != "schedule" From 
700737511ebdf2540fe22aebfe357a37de93281c Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 29 Aug 2024 09:51:02 +0200 Subject: [PATCH 189/448] [cmake] remove maipulation of HIP|ROCM_PATH --- CMakeLists.txt | 1 - cmake/hip.cmake | 94 +++++--------------------------------------- cmake/hip_path.cmake | 13 ------ 3 files changed, 9 insertions(+), 99 deletions(-) delete mode 100644 cmake/hip_path.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 21832c98592..6d0804b4eed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,6 @@ set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG}) set(THREADS_PREFER_PTHREAD_FLAG ON) # Determine which modules can be compiled -include(cmake/hip_path.cmake) include(cmake/autodetect_executors.cmake) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules/") diff --git a/cmake/hip.cmake b/cmake/hip.cmake index bd834c3ebde..52f377ad6ca 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -1,5 +1,10 @@ cmake_minimum_required(VERSION 3.21 FATAL_ERROR) enable_language(HIP) + +# We keep using NVCC/HCC for consistency with previous releases even if AMD +# updated everything to use NVIDIA/AMD in ROCM 4.1 +set(GINKGO_HIP_PLATFORM_NVCC 0) +set(GINKGO_HIP_PLATFORM_HCC 0) if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA") set(GINKGO_HIP_PLATFORM "nvidia") set(GINKGO_HIP_PLATFORM_NVIDIA ON) @@ -12,73 +17,6 @@ else() set(GINKGO_HIP_PLATFORM_HCC 1) endif() - -if(NOT DEFINED ROCM_PATH) - if(DEFINED ENV{ROCM_PATH}) - set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed") - elseif(DEFINED ENV{HIP_PATH}) - set(ROCM_PATH "$ENV{HIP_PATH}/.." 
CACHE PATH "Path to which ROCM has been installed") - else() - set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCM has been installed") - endif() -endif() - -if(NOT DEFINED HIPBLAS_PATH) - if(DEFINED ENV{HIPBLAS_PATH}) - set(HIPBLAS_PATH $ENV{HIPBLAS_PATH} CACHE PATH "Path to which HIPBLAS has been installed") - else() - set(HIPBLAS_PATH "${ROCM_PATH}/hipblas" CACHE PATH "Path to which HIPBLAS has been installed") - endif() -endif() - -if(NOT DEFINED HIPFFT_PATH) - if(DEFINED ENV{HIPFFT_PATH}) - set(HIPFFT_PATH $ENV{HIPFFT_PATH} CACHE PATH "Path to which HIPFFT has been installed") - else() - set(HIPFFT_PATH "${ROCM_PATH}/hipfft" CACHE PATH "Path to which HIPFFT has been installed") - endif() -endif() - -if(NOT DEFINED HIPRAND_PATH) - if(DEFINED ENV{HIPRAND_PATH}) - set(HIPRAND_PATH $ENV{HIPRAND_PATH} CACHE PATH "Path to which HIPRAND has been installed") - else() - set(HIPRAND_PATH "${ROCM_PATH}/hiprand" CACHE PATH "Path to which HIPRAND has been installed") - endif() -endif() - -if(NOT DEFINED ROCRAND_PATH) - if(DEFINED ENV{ROCRAND_PATH}) - set(ROCRAND_PATH $ENV{ROCRAND_PATH} CACHE PATH "Path to which ROCRAND has been installed") - else() - set(ROCRAND_PATH "${ROCM_PATH}/rocrand" CACHE PATH "Path to which ROCRAND has been installed") - endif() -endif() - -if(NOT DEFINED HIPSPARSE_PATH) - if(DEFINED ENV{HIPSPARSE_PATH}) - set(HIPSPARSE_PATH $ENV{HIPSPARSE_PATH} CACHE PATH "Path to which HIPSPARSE has been installed") - else() - set(HIPSPARSE_PATH "${ROCM_PATH}/hipsparse" CACHE PATH "Path to which HIPSPARSE has been installed") - endif() -endif() - -if(NOT DEFINED HIP_CLANG_PATH) - if(NOT DEFINED ENV{HIP_CLANG_PATH}) - set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to which HIP compatible clang binaries have been installed") - else() - set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to which HIP compatible clang binaries have been installed") - endif() -endif() - -if(NOT DEFINED ROCTRACER_PATH) - if(DEFINED ENV{ROCTRACER_PATH}) - 
set(ROCTRACER_PATH $ENV{ROCTRACER_PATH} CACHE PATH "Path to which ROCTRACER has been installed") - else() - set(ROCTRACER_PATH "${ROCM_PATH}/roctracer" CACHE PATH "Path to which ROCTRACER has been installed") - endif() -endif() - find_program( HIP_HIPCONFIG_EXECUTABLE NAMES hipconfig @@ -97,24 +35,10 @@ if(NOT HIP_HIPCONFIG_EXECUTABLE) endif() execute_process( - COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version - OUTPUT_VARIABLE GINKGO_HIP_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_STRIP_TRAILING_WHITESPACE - ) - -## Setup all CMAKE variables to find HIP and its dependencies -set(GINKGO_HIP_MODULE_PATH "${HIP_PATH}/cmake") -list(APPEND CMAKE_MODULE_PATH "${GINKGO_HIP_MODULE_PATH}") -if (GINKGO_HIP_PLATFORM_AND) - list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake") -endif() -list(APPEND CMAKE_PREFIX_PATH - "${HIPBLAS_PATH}/lib/cmake" - "${HIPFFT_PATH}/lib/cmake" - "${HIPRAND_PATH}/lib/cmake" - "${HIPSPARSE_PATH}/lib/cmake" - "${ROCRAND_PATH}/lib/cmake" + COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version + OUTPUT_VARIABLE GINKGO_HIP_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_STRIP_TRAILING_WHITESPACE ) find_package(hipblas REQUIRED) diff --git a/cmake/hip_path.cmake b/cmake/hip_path.cmake deleted file mode 100644 index a9f418cb3bd..00000000000 --- a/cmake/hip_path.cmake +++ /dev/null @@ -1,13 +0,0 @@ -if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - set(ENV{HIP_PATH} ${HIP_PATH}) - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") - endif() -endif() - -# We keep using NVCC/HCC for consistency with previous releases even if AMD -# updated everything to use NVIDIA/AMD in ROCM 4.1 -set(GINKGO_HIP_PLATFORM_NVCC 0) -set(GINKGO_HIP_PLATFORM_HCC 0) From a59e298ac4ca86514a0c35a5f5344b0e1f285e64 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 29 Aug 2024 09:52:13 +0200 Subject: [PATCH 190/448] [cmake] warn on faulty rocm CMake 
setup --- cmake/autodetect_executors.cmake | 3 +++ cmake/hip.cmake | 30 ++++++---------------- cmake/hip_helpers.cmake | 43 ++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 23 deletions(-) create mode 100644 cmake/hip_helpers.cmake diff --git a/cmake/autodetect_executors.cmake b/cmake/autodetect_executors.cmake index d3ad2e3a6a1..656e5096fc1 100644 --- a/cmake/autodetect_executors.cmake +++ b/cmake/autodetect_executors.cmake @@ -35,6 +35,9 @@ if (NOT DEFINED GINKGO_BUILD_HIP) if(CMAKE_HIP_COMPILER) message(STATUS "Enabling HIP executor") set(GINKGO_HAS_HIP ON) + else () + include(cmake/hip_helpers.cmake) + ginkgo_check_hip_detection_issue() endif() endif() diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 52f377ad6ca..6a05933377f 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -1,4 +1,10 @@ cmake_minimum_required(VERSION 3.21 FATAL_ERROR) + +include(cmake/hip_helpers.cmake) +include(CheckLanguage) +check_language(HIP) +ginkgo_check_hip_detection_issue() + enable_language(HIP) # We keep using NVCC/HCC for consistency with previous releases even if AMD @@ -17,29 +23,7 @@ else() set(GINKGO_HIP_PLATFORM_HCC 1) endif() -find_program( - HIP_HIPCONFIG_EXECUTABLE - NAMES hipconfig - PATHS - "${HIP_ROOT_DIR}" - ENV ROCM_PATH - ENV HIP_PATH - /opt/rocm - /opt/rocm/hip - PATH_SUFFIXES bin - NO_DEFAULT_PATH -) -if(NOT HIP_HIPCONFIG_EXECUTABLE) - # Now search in default paths - find_program(HIP_HIPCONFIG_EXECUTABLE hipconfig) -endif() - -execute_process( - COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version - OUTPUT_VARIABLE GINKGO_HIP_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_STRIP_TRAILING_WHITESPACE -) +ginkgo_find_hip_version() find_package(hipblas REQUIRED) find_package(hipfft) # optional dependency diff --git a/cmake/hip_helpers.cmake b/cmake/hip_helpers.cmake new file mode 100644 index 00000000000..cf9062bde41 --- /dev/null +++ b/cmake/hip_helpers.cmake @@ -0,0 +1,43 @@ +function(ginkgo_find_hip_version) + find_program( + 
HIP_HIPCONFIG_EXECUTABLE + NAMES hipconfig + PATHS + "${HIP_ROOT_DIR}" + ENV ROCM_PATH + ENV HIP_PATH + /opt/rocm + /opt/rocm/hip + PATH_SUFFIXES bin + NO_DEFAULT_PATH + ) + if(NOT HIP_HIPCONFIG_EXECUTABLE) + # Now search in default paths + find_program(HIP_HIPCONFIG_EXECUTABLE hipconfig) + endif() + + execute_process( + COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version + OUTPUT_VARIABLE GINKGO_HIP_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_STRIP_TRAILING_WHITESPACE + ) + set(GINKGO_HIP_VERSION ${GINKGO_HIP_VERSION} PARENT_SCOPE) +endfunction() + +# This function checks if ROCm might not be detected correctly. +# ROCm < 5.7 has a faulty CMake setup that requires setting +# CMAKE_PREFIX_PATH=$ROCM_PATH/lib/cmake, otherwise HIP will not be detected. +function(ginkgo_check_hip_detection_issue) + if(NOT CMAKE_HIP_COMPILER) + ginkgo_find_hip_version() + if (GINKGO_HIP_VERSION AND GINKGO_HIP_VERSION VERSION_LESS 5.7) + message(WARNING + "Could not find a HIP compiler, but HIP version ${GINKGO_HIP_VERSION} was detected through " + "hipconfig. Try setting the environment variable CMAKE_PREFIX_PATH=$ROCM_PATH/lib/cmake, or " + "update to ROCm >= 5.7." + ) + endif () + endif () +endfunction() + From a80b551585ff1883922fb9d9a25ba9db9e1a3fea Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 29 Aug 2024 10:33:45 +0200 Subject: [PATCH 191/448] [cmake] use cmakedefine01 for hip platform --- include/ginkgo/config.hpp.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 329918399d6..1dfa6bc61bc 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -41,10 +41,10 @@ /* What is HIP compiled for, hcc or nvcc? 
*/ // clang-format off -#define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@ +#cmakedefine01 GINKGO_HIP_PLATFORM_HCC -#define GINKGO_HIP_PLATFORM_NVCC @GINKGO_HIP_PLATFORM_NVCC@ +#cmakedefine01 GINKGO_HIP_PLATFORM_NVCC // clang-format on From ad717eae6a9eee975d8011fdd5405e0f17bf112f Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 15 Oct 2024 06:34:51 +0000 Subject: [PATCH 192/448] fixup! [cmake] remove maipulation of HIP|ROCM_PATH --- cmake/GinkgoConfig.cmake.in | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 1f12251f93d..a5ead102c23 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -135,7 +135,7 @@ set(GINKGO_HAVE_VTUNE "@GINKGO_HAVE_VTUNE@") set(GINKGO_HAVE_METIS "@GINKGO_HAVE_METIS@") set_and_check(VTune_PATH "@VTune_PATH@") -# ensure Threads settings +# ensure Threads settings set(THREADS_PREFER_PTHREAD_FLAG ON) # NOTE: we do not export benchmarks, examples, tests or devel tools @@ -176,7 +176,6 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP) find_dependency(hipsparse) find_dependency(rocrand) find_dependency(rocthrust) - set_and_check(ROCTRACER_PATH "@ROCTRACER_PATH@") find_dependency(ROCTX) endif() From 1bc2ec10460163a0720a30dc3421a3c78a68ae99 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 15 Oct 2024 07:32:31 +0000 Subject: [PATCH 193/448] [cmake] use ROCM_PATH for finding roctx --- cmake/Modules/FindROCTX.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Modules/FindROCTX.cmake b/cmake/Modules/FindROCTX.cmake index e5647080ca3..1bcb344c6d9 100644 --- a/cmake/Modules/FindROCTX.cmake +++ b/cmake/Modules/FindROCTX.cmake @@ -26,11 +26,11 @@ # ``ROCTX_FOUND`` # If false, do not try to use the ROCTX library. 
-find_path(ROCTX_INCLUDE_DIR NAMES roctx.h HINTS ${ROCTRACER_PATH}/include) +find_path(ROCTX_INCLUDE_DIR NAMES roctx.h HINTS ${ROCTRACER_PATH}/include ${ROCM_PATH}/include/roctracer) mark_as_advanced(ROCTX_INCLUDE_DIR) if(NOT ROCTX_LIBRARY) - find_library(ROCTX_LIBRARY NAMES roctx64 HINTS ${ROCTRACER_PATH}/lib) + find_library(ROCTX_LIBRARY NAMES roctx64 HINTS ${ROCTRACER_PATH}/lib ${ROCM_PATH}/lib) endif() include(FindPackageHandleStandardArgs) From 4f79e370d6fff56b6c4c68b18fff446e2ed0a2e1 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 15 Oct 2024 07:59:48 +0000 Subject: [PATCH 194/448] [dist] use xstd::void_t again in is_matrix_type_builder This works around a (likely) GCC 7.5 bug when using std::void_t. --- include/ginkgo/core/base/std_extensions.hpp | 18 +++++++++++++++++- include/ginkgo/core/distributed/matrix.hpp | 3 ++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/include/ginkgo/core/base/std_extensions.hpp b/include/ginkgo/core/base/std_extensions.hpp index 893b2b0d865..a950fcc2003 100644 --- a/include/ginkgo/core/base/std_extensions.hpp +++ b/include/ginkgo/core/base/std_extensions.hpp @@ -27,8 +27,24 @@ namespace gko { * @ingroup xstd */ namespace xstd { +namespace detail { + + +template +struct make_void { + using type = void; +}; + + +} // namespace detail + + +/** + * Use the custom implementation, since the std::void_t used in + * is_matrix_type_builder seems to trigger a compiler bug in GCC 7.5. 
+ */ template -using void_t = std::void_t; +using void_t = typename detail::make_void::type; GKO_DEPRECATED("use std::uncaught_exceptions") diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 1e5e33581a9..de719bb9315 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -55,7 +56,7 @@ struct is_matrix_type_builder : std::false_type {}; template struct is_matrix_type_builder< Builder, ValueType, IndexType, - std::void_t< + xstd::void_t< decltype(std::declval().template create( std::declval>()))>> : std::true_type {}; From 14c5610983c81251aa8ccdd248bb5976deb9b7a5 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 16 Oct 2024 09:55:30 +0000 Subject: [PATCH 195/448] [test] fix compiler error with nvhpc --- test/mpi/matrix.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 1c090b6c43f..f4b8af2fb19 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -207,6 +207,8 @@ TYPED_TEST(MatrixCreation, BuildFromExistingData) using Partition = typename TestFixture::Partition; using local_index_type = typename TestFixture::local_index_type; using matrix_data = gko::matrix_data; + using input_triple = + gko::detail::input_triple; using dist_mtx_type = typename TestFixture::dist_mtx_type; using dist_vec_type = gko::experimental::distributed::Vector; using comm_index_type = gko::experimental::distributed::comm_index_type; @@ -214,18 +216,16 @@ TYPED_TEST(MatrixCreation, BuildFromExistingData) I> res_local[] = {{{2, 0}, {0, 0}}, {{0, 5}, {0, 0}}, {{0}}}; std::array, 3> size_local{{{2, 2}, {2, 2}, {1, 1}}}; std::array dist_input_local{ - {{size_local[0], {{0, 0, 2}}}, - {size_local[1], {{0, 1, 5}}}, - {size_local[2], - std::initializer_list< - gko::detail::input_triple>{}}}}; + {{size_local[0], I{{0, 0, 2}}}, + 
{size_local[1], I{{0, 1, 5}}}, + {size_local[2]}}}; I> res_non_local[] = { {{1, 0}, {3, 4}}, {{0, 0, 6}, {8, 7, 0}}, {{10, 9}}}; std::array, 3> size_non_local{{{2, 2}, {2, 3}, {1, 2}}}; std::array dist_input_non_local{ - {{size_non_local[0], {{0, 0, 1}, {1, 0, 3}, {1, 1, 4}}}, - {size_non_local[1], {{0, 2, 6}, {1, 0, 8}, {1, 1, 7}}}, - {size_non_local[2], {{0, 0, 10}, {0, 1, 9}}}}}; + {{size_non_local[0], I{{0, 0, 1}, {1, 0, 3}, {1, 1, 4}}}, + {size_non_local[1], I{{0, 2, 6}, {1, 0, 8}, {1, 1, 7}}}, + {size_non_local[2], I{{0, 0, 10}, {0, 1, 9}}}}}; std::array, 3> recv_sizes{ {{0, 1, 1}, {2, 0, 1}, {1, 1, 0}}}; std::array, 3> recv_offsets{ From a25c9d64b706bd34356427b208d201e3a864f567 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 15 Oct 2024 07:38:32 +0000 Subject: [PATCH 196/448] [core] use only `constexpr` for math functions --- include/ginkgo/core/base/math.hpp | 172 ++++-------------------------- 1 file changed, 22 insertions(+), 150 deletions(-) diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index f6847743717..33b3a566b37 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -283,7 +283,7 @@ using is_complex_s = detail::is_complex_impl; * @return `true` if T is a complex type, `false` otherwise */ template -GKO_INLINE GKO_ATTRIBUTES constexpr bool is_complex() +GKO_INLINE constexpr bool is_complex() { return detail::is_complex_impl::value; } @@ -307,7 +307,7 @@ using is_complex_or_scalar_s = detail::is_complex_or_scalar_impl; * @return `true` if T is a complex/scalar type, `false` otherwise */ template -GKO_INLINE GKO_ATTRIBUTES constexpr bool is_complex_or_scalar() +GKO_INLINE constexpr bool is_complex_or_scalar() { return detail::is_complex_or_scalar_impl::value; } @@ -511,7 +511,7 @@ using highest_precision = * @return the rounded down value */ template -GKO_INLINE GKO_ATTRIBUTES constexpr reduce_precision round_down(T val) +GKO_INLINE constexpr reduce_precision round_down(T 
val) { return static_cast>(val); } @@ -527,7 +527,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr reduce_precision round_down(T val) * @return the rounded up value */ template -GKO_INLINE GKO_ATTRIBUTES constexpr increase_precision round_up(T val) +GKO_INLINE constexpr increase_precision round_up(T val) { return static_cast>(val); } @@ -609,141 +609,19 @@ struct default_converter { * * @return returns the ceiled quotient. */ -GKO_INLINE GKO_ATTRIBUTES constexpr int64 ceildiv(int64 num, int64 den) +GKO_INLINE constexpr int64 ceildiv(int64 num, int64 den) { return (num + den - 1) / den; } -#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC - - -/** - * Returns the additive identity for T. - * - * @return additive identity for T - */ -template -GKO_INLINE __host__ constexpr T zero() -{ - return T{}; -} - - -/** - * Returns the additive identity for T. - * - * @return additive identity for T - * - * @note This version takes an unused reference argument to avoid - * complicated calls like `zero()`. Instead, it allows - * `zero(x)`. - */ -template -GKO_INLINE __host__ constexpr T zero(const T&) -{ - return zero(); -} - - -/** - * Returns the multiplicative identity for T. - * - * @return the multiplicative identity for T - */ -template -GKO_INLINE __host__ constexpr T one() -{ - return T(1); -} - - -/** - * Returns the multiplicative identity for T. - * - * @return the multiplicative identity for T - * - * @note This version takes an unused reference argument to avoid - * complicated calls like `one()`. Instead, it allows - * `one(x)`. - */ -template -GKO_INLINE __host__ constexpr T one(const T&) -{ - return one(); -} - - -/** - * Returns the additive identity for T. - * - * @return additive identity for T - */ -template -GKO_INLINE __device__ constexpr std::enable_if_t< - !std::is_same>>::value, T> -zero() -{ - return T{}; -} - - -/** - * Returns the additive identity for T. 
- * - * @return additive identity for T - * - * @note This version takes an unused reference argument to avoid - * complicated calls like `zero()`. Instead, it allows - * `zero(x)`. - */ -template -GKO_INLINE __device__ constexpr T zero(const T&) -{ - return zero(); -} - - -/** - * Returns the multiplicative identity for T. - * - * @return the multiplicative identity for T - */ -template -GKO_INLINE __device__ constexpr std::enable_if_t< - !std::is_same>>::value, T> -one() -{ - return T(1); -} - - -/** - * Returns the multiplicative identity for T. - * - * @return the multiplicative identity for T - * - * @note This version takes an unused reference argument to avoid - * complicated calls like `one()`. Instead, it allows - * `one(x)`. - */ -template -GKO_INLINE __device__ constexpr T one(const T&) -{ - return one(); -} - - -#else - - /** * Returns the additive identity for T. * * @return additive identity for T */ template -GKO_INLINE GKO_ATTRIBUTES constexpr T zero() +GKO_INLINE constexpr T zero() { return T{}; } @@ -759,7 +637,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T zero() * `zero(x)`. */ template -GKO_INLINE GKO_ATTRIBUTES constexpr T zero(const T&) +GKO_INLINE constexpr T zero(const T&) { return zero(); } @@ -771,7 +649,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T zero(const T&) * @return the multiplicative identity for T */ template -GKO_INLINE GKO_ATTRIBUTES constexpr T one() +GKO_INLINE constexpr T one() { return T(1); } @@ -787,15 +665,12 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T one() * `one(x)`. */ template -GKO_INLINE GKO_ATTRIBUTES constexpr T one(const T&) +GKO_INLINE constexpr T one(const T&) { return one(); } -#endif // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC - - #undef GKO_BIND_ZERO_ONE @@ -808,7 +683,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T one(const T&) * @return true iff the given value is zero, i.e. 
`value == zero()` */ template -GKO_INLINE GKO_ATTRIBUTES constexpr bool is_zero(T value) +GKO_INLINE constexpr bool is_zero(T value) { return value == zero(); } @@ -823,7 +698,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr bool is_zero(T value) * @return true iff the given value is not zero, i.e. `value != zero()` */ template -GKO_INLINE GKO_ATTRIBUTES constexpr bool is_nonzero(T value) +GKO_INLINE constexpr bool is_nonzero(T value) { return value != zero(); } @@ -841,7 +716,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr bool is_nonzero(T value) * */ template -GKO_INLINE GKO_ATTRIBUTES constexpr T max(const T& x, const T& y) +GKO_INLINE constexpr T max(const T& x, const T& y) { return x >= y ? x : y; } @@ -859,7 +734,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T max(const T& x, const T& y) * */ template -GKO_INLINE GKO_ATTRIBUTES constexpr T min(const T& x, const T& y) +GKO_INLINE constexpr T min(const T& x, const T& y) { return x <= y ? x : y; } @@ -1053,7 +928,7 @@ GKO_ATTRIBUTES GKO_INLINE constexpr auto conj(const T& x) * @return The squared norm of the object. */ template -GKO_INLINE GKO_ATTRIBUTES constexpr auto squared_norm(const T& x) +GKO_INLINE constexpr auto squared_norm(const T& x) -> decltype(real(conj(x) * x)) { return real(conj(x) * x); @@ -1070,16 +945,15 @@ GKO_INLINE GKO_ATTRIBUTES constexpr auto squared_norm(const T& x) * @return x >= zero() ? x : -x; */ template -GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t::value, T> -abs(const T& x) +GKO_INLINE constexpr std::enable_if_t::value, T> abs( + const T& x) { return x >= zero() ? 
x : -x; } template -GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t::value, - remove_complex> +GKO_INLINE constexpr std::enable_if_t::value, remove_complex> abs(const T& x) { return sqrt(squared_norm(x)); @@ -1092,7 +966,7 @@ abs(const T& x) * @tparam T the value type to return */ template -GKO_INLINE GKO_ATTRIBUTES constexpr T pi() +GKO_INLINE constexpr T pi() { return static_cast(3.1415926535897932384626433); } @@ -1107,8 +981,8 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T pi() * @tparam T the corresponding real value type. */ template -GKO_INLINE GKO_ATTRIBUTES constexpr std::complex> unit_root( - int64 n, int64 k = 1) +GKO_INLINE constexpr std::complex> unit_root(int64 n, + int64 k = 1) { return std::polar(one>(), remove_complex{2} * pi>() * k / n); @@ -1259,8 +1133,7 @@ GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan( * @return NaN. */ template -GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t::value, T> -nan() +GKO_INLINE constexpr std::enable_if_t::value, T> nan() { return std::numeric_limits::quiet_NaN(); } @@ -1274,8 +1147,7 @@ nan() * @return complex{NaN, NaN}. */ template -GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t::value, T> -nan() +GKO_INLINE constexpr std::enable_if_t::value, T> nan() { return T{nan>(), nan>()}; } From 8381a24ea5e614f7585063206b0e2850fed0e17f Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 15 Oct 2024 07:38:51 +0000 Subject: [PATCH 197/448] [core] remove unused #undef --- include/ginkgo/core/base/math.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 33b3a566b37..cd5e489b95d 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -671,9 +671,6 @@ GKO_INLINE constexpr T one(const T&) } -#undef GKO_BIND_ZERO_ONE - - /** * Returns true if and only if the given value is zero. 
* From 4b48ecb18e7f8bbecf69f63c769031d624007d5a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 21 Oct 2024 14:00:23 +0200 Subject: [PATCH 198/448] temporarily disable oneAPI CI jobs The runners are down for a short while --- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 18771d9bc2d..cc67883c4b3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -344,6 +344,7 @@ build/icpx20231/igpu/release/shared: - .build_and_test_template - .default_variables - .quick_test_condition + - .disable_job_condition - .use_gko-oneapi20231-igpu variables: CXX_COMPILER: "icpx" @@ -377,6 +378,7 @@ build/icpx/igpu/release/static: - .build_and_test_template - .default_variables - .full_test_condition + - .disable_job_condition - .use_gko-oneapi-igpu variables: CXX_COMPILER: "dpcpp" From 068c9ebac84c25f7fd9b0ae3f790aa083bcb5e6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Mon, 8 Apr 2024 18:34:10 +0200 Subject: [PATCH 199/448] Simplify the testing types Add template type functions to combine and merge type lists. As a result, the GINKGO_DPCPP_SINGLE_MODE only needs to be present once. Additionally, change the typen name ValueAndIndexType to ComplexAndPODTypes (because gko::size_type is not an IndexType). 
--- core/test/base/array.cpp | 2 +- core/test/base/iterator_factory.cpp | 2 +- core/test/utils.hpp | 262 ++++++++++-------- core/test/utils/CMakeLists.txt | 1 + core/test/utils/utils_test.cpp | 236 ++++++++++++++++ cuda/test/base/array.cpp | 2 +- reference/test/base/array.cpp | 2 +- .../test/components/fill_array_kernels.cpp | 2 +- .../test/components/reduce_array_kernels.cpp | 2 +- test/components/fill_array_kernels.cpp | 2 +- test/components/reduce_array_kernels.cpp | 2 +- 11 files changed, 387 insertions(+), 128 deletions(-) create mode 100644 core/test/utils/utils_test.cpp diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp index 71816f690ce..f7e03855d06 100644 --- a/core/test/base/array.cpp +++ b/core/test/base/array.cpp @@ -40,7 +40,7 @@ class Array : public ::testing::Test { gko::array x; }; -TYPED_TEST_SUITE(Array, gko::test::ValueAndIndexTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Array, gko::test::ComplexAndPODTypes, TypenameNameGenerator); TYPED_TEST(Array, CanBeCreatedWithoutAnExecutor) diff --git a/core/test/base/iterator_factory.cpp b/core/test/base/iterator_factory.cpp index c4dc30bf219..bbc3bbfd04f 100644 --- a/core/test/base/iterator_factory.cpp +++ b/core/test/base/iterator_factory.cpp @@ -366,7 +366,7 @@ class PermuteIterator : public ::testing::Test { using value_type = ValueType; }; -TYPED_TEST_SUITE(PermuteIterator, gko::test::ValueAndIndexTypes, +TYPED_TEST_SUITE(PermuteIterator, gko::test::ComplexAndPODTypes, TypenameNameGenerator); diff --git a/core/test/utils.hpp b/core/test/utils.hpp index 43ded30cde5..d711e6310e3 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -29,154 +29,176 @@ namespace gko { namespace test { +namespace detail { + + +template +struct cartesian_type_product {}; + +template